diff --git a/src/cpu/x64/jit_uni_tbb_batch_normalization.cpp b/src/cpu/x64/jit_uni_tbb_batch_normalization.cpp
index ed7f909f9c7..5e3d72be4fa 100644
--- a/src/cpu/x64/jit_uni_tbb_batch_normalization.cpp
+++ b/src/cpu/x64/jit_uni_tbb_batch_normalization.cpp
@@ -2168,7 +2168,11 @@ struct driver_t : public c_compatible {
 
             dim_t total_size = size_src_dst + size_stats_ss_tensors;
 
-            dim_t n_chunks = total_size / platform::get_per_core_cache_size(2);
+            // For nthr_ <= 4, use at least nthr_ chunks (realtime inference)
+            const int n_chunks_min = nthr_ <= 4 ? nstl::min(4, nthr_) : 1;
+            const size_t l2_per_core = platform::get_per_core_cache_size(2);
+            dim_t n_chunks
+                    = nstl::max<dim_t>(n_chunks_min, total_size / l2_per_core);
 
             // we prioritize parallelization on N, then S, and finally C
             nthr.N = utils::saturate<dim_t>(1, N_, n_chunks);