diff --git a/src/cpu/x64/jit_uni_tbb_batch_normalization.cpp b/src/cpu/x64/jit_uni_tbb_batch_normalization.cpp
index ed7f909f9c7..5e3d72be4fa 100644
--- a/src/cpu/x64/jit_uni_tbb_batch_normalization.cpp
+++ b/src/cpu/x64/jit_uni_tbb_batch_normalization.cpp
@@ -2168,7 +2168,11 @@ struct driver_t : public c_compatible {
 
         dim_t total_size = size_src_dst + size_stats_ss_tensors;
 
-        dim_t n_chunks = total_size / platform::get_per_core_cache_size(2);
+        // Realtime inference: use at least nthr_ chunks when nthr_ <= 4
+        const int n_chunks_min = nthr_ <= 4 ? nthr_ : 1;
+        const size_t l2_per_core = platform::get_per_core_cache_size(2);
+        dim_t n_chunks
+                = nstl::max<dim_t>(n_chunks_min, total_size / l2_per_core);
 
         // we prioritize parallelization on N, then S, and finally C
         nthr.N = utils::saturate(1, N_, n_chunks);