Merge branch 'main' into dimitri/17040-single-release-image
dimitri-tenstorrent authored Jan 23, 2025
2 parents 5f6cbfc + fb9f208 commit 0de3350
Showing 25 changed files with 1,414 additions and 157 deletions.
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -8,7 +8,6 @@ endif()
############################################
# Project setup
############################################
include(cmake/compilers.cmake)

list(PREPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
include(version)
@@ -79,6 +78,7 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
endif()
endif()

include(compilers)
CHECK_COMPILERS()

if(NOT isMultiConfig AND NOT CMAKE_BUILD_TYPE IN_LIST CMAKE_CONFIGURATION_TYPES)
@@ -91,6 +91,9 @@ set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -DDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DDEBUG")
set(CMAKE_CXX_FLAGS_CI "-O3 -DDEBUG")

# We're not currently using C++20 modules, so don't bother scanning for them
set(CMAKE_CXX_SCAN_FOR_MODULES FALSE)

############################################################################################################################
# Project Options
# The following options and their defaults impact what artifacts get built
2 changes: 1 addition & 1 deletion models/demos/bert_tiny/tests/test_performance.py
@@ -119,7 +119,7 @@ def test_perf_device_bare_metal(batch_size, expected_perf):
margin = 0.03

if is_wormhole_b0():
expected_perf = 3990.0
expected_perf = 4114.8
else:
expected_perf = 3460.0

@@ -1116,3 +1116,32 @@ def test_transpose_16411(device):
assert_with_pcc(p_c2, ttnn.to_torch(c2), 0.9999)
assert_with_pcc(p_c3, ttnn.to_torch(c3), 0.9999)
assert_with_pcc(p_c4, ttnn.to_torch(c4), 0.9999)


@pytest.mark.parametrize("rank", [5])
@pytest.mark.parametrize("indices", [[0, 1], [0, 2], [0, 3], [0, 4], [1, 2], [1, 3], [1, 4], [2, 3], [2, 4], [3, 4]])
@pytest.mark.parametrize("layout", [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT])
def test_transpose_high_rank(*, device: ttnn.Device, rank: int, indices, layout):
torch.manual_seed(2005)
ttnn.disable_and_clear_program_cache(device)
ttnn.enable_program_cache(device)

shape = [2] * rank

a = torch.randn(shape, dtype=torch.bfloat16)
b = torch.randn(shape, dtype=torch.bfloat16)

tt_a = ttnn.from_torch(a, device=device, layout=layout)
tt_b = ttnn.from_torch(b, device=device, layout=layout)

a = a.transpose(*indices)
b = b.transpose(*indices)

tt_a = ttnn.transpose(tt_a, *indices)
tt_b = ttnn.transpose(tt_b, *indices)

output_a = ttnn.to_torch(tt_a)
output_b = ttnn.to_torch(tt_b)

assert torch.allclose(a, output_a)
assert torch.allclose(b, output_b)
124 changes: 117 additions & 7 deletions tests/ttnn/unit_tests/operations/test_permute.py
@@ -5,6 +5,7 @@
import pytest

import torch
import math

import ttnn
import itertools
@@ -182,13 +183,13 @@ def generate_permutations(N):
yield perm


@skip_for_blackhole("tilize_block gives bad pcc after second iteration")
@skip_for_grayskull("tilize_block gives bad pcc after second iteration")
@pytest.mark.parametrize("shape", [(7, 7, 7, 7, 7)])
@pytest.mark.parametrize("perm", generate_permutations(5))
@pytest.mark.parametrize("memory_config", [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.float32])
def test_permute_5d_width(shape, perm, memory_config, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
input_a = torch.randn(shape)
torch_output = torch.permute(input_a, perm)
@@ -202,13 +203,13 @@ def test_permute_5d_width(shape, perm, memory_config, dtype, device):
assert_with_pcc(torch_output, tt_output, 0.9999)


@skip_for_blackhole("tilize_block gives bad pcc after second iteration")
@skip_for_grayskull("tilize_block gives bad pcc after second iteration")
@pytest.mark.parametrize("shape", [(3, 65, 3, 3, 65), (1, 6, 256, 20, 50), (6, 20, 50, 1, 256)])
@pytest.mark.parametrize("perm", [(4, 0, 3, 2, 1), (1, 3, 4, 0, 2), (3, 0, 4, 1, 2)])
@pytest.mark.parametrize("memory_config", [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.float32])
def test_permute_5d_blocked(shape, perm, memory_config, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(520)
input_a = torch.randn(shape)

@@ -224,8 +225,6 @@ def test_permute_5d_blocked(shape, perm, memory_config, dtype, device):
assert_with_pcc(torch_output, tt_output, 0.9999)


@skip_for_blackhole("tilize_block gives bad pcc after second iteration")
@skip_for_grayskull("tilize_block gives bad pcc after second iteration")
def test_permute_nd(device):
torch.manual_seed(2005)
torch_tensor = torch.rand((1, 3, 16, 16, 16, 16), dtype=torch.bfloat16)
@@ -400,7 +399,6 @@ def test_permutations_5d_fixed_w(shape, perm, dtype, device):
assert_with_pcc(torch_output, output_tensor, 0.9999)


@pytest.mark.skip("#16575 to_layout from tiled to RM fails on reshape")
@pytest.mark.parametrize("shape", [[1, 9, 91, 7, 9]])
@pytest.mark.parametrize("perm", [[0, 3, 4, 1, 2]])
def test_permute_adversarial(shape, perm, device):
@@ -427,3 +425,115 @@ def test_permute_4d_fixed_w(shape, perm, device):
torch_output = torch.permute(torch_tensor, perm)
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)


def generate_fixed_no_dim0_dim1_transpose_permutations(N, dim0, dim1):
perms_Nd = generate_permutations(N)
for perm in perms_Nd:
if perm[dim0] != dim1:
yield perm


@pytest.mark.parametrize("shape", [[7, 7, 7, 17, 17]])
@pytest.mark.parametrize("perm", [[0, 1, 4, 3, 2]])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.float32])
@pytest.mark.parametrize("pad_value", [35.0, float("-inf"), None])
def test_permute_5d_yw_padded(shape, perm, dtype, pad_value, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
torch_tensor = torch.rand(shape, dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_tensor, layout=ttnn.TILE_LAYOUT, dtype=dtype, device=device)
ttnn_output = ttnn.permute(input_tensor, perm, pad_value=pad_value)
output_tensor = ttnn.to_torch(ttnn_output)
torch_output = torch.permute(torch_tensor, perm)

assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)

if pad_value is not None:
logical_shape = torch_output.shape
output_padded = ttnn.from_device(ttnn_output).to_torch()
padded_shape = output_padded.shape
num_padded_values = torch.prod(torch.tensor(padded_shape)) - torch.prod(torch.tensor(logical_shape))
assert torch.sum(output_padded == pad_value) == num_padded_values


@pytest.mark.parametrize("shape", [[33, 1, 17, 33, 33]])
@pytest.mark.parametrize("perm", generate_fixed_no_dim0_dim1_transpose_permutations(5, 4, 3))
@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.float32])
def test_permute_5d_yw_permutations(shape, perm, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
torch_tensor = torch.rand(shape, dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_tensor, layout=ttnn.TILE_LAYOUT, dtype=dtype, device=device)
output_tensor = ttnn.permute(input_tensor, perm)
output_tensor = ttnn.to_torch(output_tensor)
torch_output = torch.permute(torch_tensor, perm)
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)


@pytest.mark.parametrize("shape", [[1, 1, 32, 32], [1, 1, 128, 128], [32, 32, 32, 32], [96, 96, 96, 96]])
@pytest.mark.parametrize("perm", [[0, 3, 2, 1], [3, 1, 2, 0], [1, 3, 2, 0], [3, 0, 2, 1]])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16])
def test_permute_4d_yw_permutations(shape, perm, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
torch_tensor = torch.rand(shape, dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_tensor, layout=ttnn.TILE_LAYOUT, dtype=dtype, device=device)
output_tensor = ttnn.permute(input_tensor, perm)
output_tensor = ttnn.to_torch(output_tensor)
torch_output = torch.permute(torch_tensor, perm)
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)


@pytest.mark.parametrize("shape", [[1, 1, 32, 32], [1, 1, 128, 128], [32, 32, 32, 32], [96, 96, 96, 96]])
@pytest.mark.parametrize("perm", [[2, 3, 0, 1], [3, 2, 1, 0], [2, 3, 1, 0], [3, 2, 0, 1]])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16])
def test_permute_4d_whyx_permutations(shape, perm, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
torch_tensor = torch.rand(shape, dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_tensor, layout=ttnn.TILE_LAYOUT, dtype=dtype, device=device)
output_tensor = ttnn.permute(input_tensor, perm)
output_tensor = ttnn.to_torch(output_tensor)
torch_output = torch.permute(torch_tensor, perm)
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)


@pytest.mark.parametrize("shape", [[1, 1, 32, 32], [1, 1, 128, 128], [32, 32, 32, 32], [96, 96, 96, 96]])
@pytest.mark.parametrize("perm", [[0, 2, 3, 1], [0, 3, 1, 2], [1, 2, 3, 0], [2, 1, 3, 0], [2, 0, 3, 1]])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16])
def test_permute_4d_other_permutations(shape, perm, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
torch_tensor = torch.rand(shape, dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_tensor, layout=ttnn.TILE_LAYOUT, dtype=dtype, device=device)
output_tensor = ttnn.permute(input_tensor, perm)
output_tensor = ttnn.to_torch(output_tensor)
torch_output = torch.permute(torch_tensor, perm)
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)


@pytest.mark.parametrize("shape", [[33, 1, 17, 33, 33]])
@pytest.mark.parametrize("perm", [[0, 1, 4, 2, 3], [0, 4, 1, 2, 3], [2, 4, 1, 0, 3], [4, 2, 1, 0, 3]])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.float32])
def test_permute_5d_wyh(shape, perm, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
torch_tensor = torch.rand(shape, dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_tensor, layout=ttnn.TILE_LAYOUT, dtype=dtype, device=device)
output_tensor = ttnn.permute(input_tensor, perm, pad_value=0.0)
output_tensor = ttnn.to_torch(output_tensor)
torch_output = torch.permute(torch_tensor, perm)
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)
2 changes: 1 addition & 1 deletion tt-train/cmake/dependencies.cmake
@@ -58,7 +58,7 @@ CPMAddPackage(NAME magic_enum GITHUB_REPOSITORY Neargye/magic_enum GIT_TAG v0.9.
# nlohmann/json : https://github.com/nlohmann/json
############################################################################################################################

CPMAddPackage(NAME json GITHUB_REPOSITORY nlohmann/json GIT_TAG v3.9.1 OPTIONS "JSON_BuildTests OFF")
CPMAddPackage(NAME json GITHUB_REPOSITORY nlohmann/json GIT_TAG v3.11.3 OPTIONS "JSON_BuildTests OFF")

CPMAddPackage(NAME xtl GITHUB_REPOSITORY xtensor-stack/xtl GIT_TAG 0.7.7 OPTIONS "XTL_ENABLE_TESTS OFF")

2 changes: 1 addition & 1 deletion tt-train/sources/examples/linear_regression_ddp/main.cpp
@@ -86,7 +86,7 @@ int main() {
optimizer.zero_grad();
auto output = (*model)(data);
auto loss = ttml::ops::mse_loss(output, targets);
fmt::print("Loss shape: {}\n", loss->get_value().shape());
fmt::print("Loss shape: {}\n", loss->get_value().get_shape());
auto mesh_shape = device->shape();
ttml::core::MeshToXTensorVariant<float> identity_composer =
ttml::core::VectorMeshToXTensor<float>(mesh_shape);
2 changes: 1 addition & 1 deletion tt-train/sources/examples/sample_app/main.cpp
@@ -14,7 +14,7 @@ void print_tensor(const tt::tt_metal::Tensor& tensor) {
// but we are using TILE layout. The printed format WILL NOT be correct. But good enough for a demo

// Get the shape of the tensor
auto shape = tensor.shape();
auto shape = tensor.get_shape();
// compute the size of the tensor
size_t size = 1;
for (size_t i = 0; i < shape.size(); i++) size *= shape[i];
10 changes: 10 additions & 0 deletions tt-train/sources/ttml/autograd/tensor.cpp
@@ -109,6 +109,16 @@ void Tensor::set_value(const tt::tt_metal::Tensor& value) {
}

void Tensor::set_grad(const tt::tt_metal::Tensor& grad) {
if (core::is_tensor_initialized(grad)) {
auto grad_shape = grad.get_shape();
auto value_shape = m_value.get_tensor().get_shape();
if (grad_shape != value_shape) {
throw std::logic_error(fmt::format(
"Shapes of gradients are not equal. Expected: {}, got: {}",
m_value.get_tensor().get_shape(),
grad.get_shape()));
}
}
m_grad = grad;
}

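The set_grad change above adds a guard so an initialized gradient is only accepted when its shape matches the tensor's value. As a rough standalone sketch of that pattern, using hypothetical stand-in types rather than the real ttml/tt_metal classes:

// Standalone sketch of the gradient shape guard added to Tensor::set_grad above.
// SimpleTensor and SimpleAutogradTensor are hypothetical stand-ins, not the real ttml types.
#include <stdexcept>
#include <vector>

struct SimpleTensor {
    std::vector<int> shape;    // logical shape
    bool initialized = false;  // plays the role of core::is_tensor_initialized(...)
};

struct SimpleAutogradTensor {
    SimpleTensor value;
    SimpleTensor grad;

    void set_grad(const SimpleTensor& g) {
        // Only validate a gradient that actually holds data; an empty gradient is stored as-is.
        if (g.initialized && g.shape != value.shape) {
            throw std::logic_error("gradient shape does not match value shape");
        }
        grad = g;
    }
};

int main() {
    SimpleAutogradTensor t;
    t.value = {{2, 3}, true};
    t.set_grad({{2, 3}, true});    // accepted: shapes match
    // t.set_grad({{3, 2}, true}); // would throw std::logic_error
    return 0;
}

With this guard, a mismatched gradient fails fast at the point of assignment rather than surfacing later as a silent numerical error.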
4 changes: 2 additions & 2 deletions tt-train/sources/ttml/models/gpt2.cpp
@@ -65,9 +65,9 @@ void weights_initialization(Transformer& model) {
for (auto& [name, tensor_ptr] : params) {
const auto& tensor = tensor_ptr->get_value();
if (name.find("weight") != std::string::npos) {
init::normal_init(tensor_ptr, tensor.shape(), {0.F, 0.02F});
init::normal_init(tensor_ptr, tensor.get_shape(), {0.F, 0.02F});
} else if (name.find("bias") != std::string::npos) {
init::constant_init(tensor_ptr, tensor.shape(), 0.F);
init::constant_init(tensor_ptr, tensor.get_shape(), 0.F);
}
}
}
4 changes: 2 additions & 2 deletions tt-train/sources/ttml/ops/layernorm_op.cpp
@@ -81,7 +81,7 @@ autograd::TensorPtr composite_layernorm(
auto tensor_shape = tensor->get_value().get_shape();

auto shape = core::create_shape({tensor_shape[0], tensor_shape[1], tensor_shape[2], 1});
auto mean = core::zeros(shape, &autograd::ctx().get_device(), tensor->get_value().dtype());
auto mean = core::zeros(shape, &autograd::ctx().get_device(), tensor->get_value().get_dtype());
ttnn::moreh_mean(
tensor->get_value(),
3, // last dimension
@@ -139,7 +139,7 @@ autograd::TensorPtr composite_layernorm(
// dnorm.mean(-1, keepdim=True)
auto dnorm_shape = dtensor_normalized.get_shape();
auto shape = core::create_shape({dnorm_shape[0], dnorm_shape[1], dnorm_shape[2], 1});
auto dnorm_mean = core::zeros(shape, &autograd::ctx().get_device(), tensor->get_value().dtype());
auto dnorm_mean = core::zeros(shape, &autograd::ctx().get_device(), tensor->get_value().get_dtype());
ttnn::moreh_mean(
dtensor_normalized,
/* dim */ 3,
4 changes: 2 additions & 2 deletions tt-train/sources/ttml/ops/losses.cpp
@@ -70,7 +70,7 @@ autograd::TensorPtr nll_loss(
auto* device = &autograd::ctx().get_device();
auto divisor = core::empty(ttnn::Shape({1, 1}, {32, 32}), device, prediction->get_value().memory_config());

auto tensor_shape = prediction->get_value().shape();
auto tensor_shape = prediction->get_value().get_shape();
uint32_t Ndim = tensor_shape[0] * tensor_shape[1] * tensor_shape[2];
uint32_t Cdim = tensor_shape[3];
auto reshaped_tensor = ttnn::reshape(prediction->get_value(), ttnn::Shape({Ndim, Cdim}));
@@ -98,7 +98,7 @@ autograd::TensorPtr nll_loss(
/* ignore_index */ -100,
/* memory_config */ std::nullopt,
/* compute_kernel_config */ core::ComputeKernelConfig::precise());
grad = ttnn::reshape(grad, prediction->get_value().shape());
grad = ttnn::reshape(grad, prediction->get_value().get_shape());
prediction->add_grad(grad);
};
auto links = autograd::get_links(prediction);
2 changes: 1 addition & 1 deletion tt-train/sources/ttml/ops/unary_ops.cpp
@@ -122,7 +122,7 @@ autograd::TensorPtr mean(const autograd::TensorPtr& tensor) {
}

autograd::TensorPtr broadcast_batch(const autograd::TensorPtr& tensor, uint32_t new_batch_dim) {
if (new_batch_dim == 1 || tensor->get_value().shape()[0] == new_batch_dim) {
if (new_batch_dim == 1 || tensor->get_value().get_shape()[0] == new_batch_dim) {
return tensor;
}
auto out = ttml::autograd::create_tensor();
23 changes: 11 additions & 12 deletions tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
@@ -397,17 +397,24 @@ void process_exec_buf_end_d(uint32_t& block_noc_writes_to_clear, uint32_t block_

// Note that for non-paged writes, the number of writes per page is always 1
// This means each noc_write frees up a page
template <bool multicast>
void process_write_linear(
uint32_t num_mcast_dests, uint32_t& block_noc_writes_to_clear, uint32_t block_next_start_addr[]) {
volatile tt_l1_ptr CQDispatchCmd* cmd = (volatile tt_l1_ptr CQDispatchCmd*)cmd_ptr;
bool multicast = num_mcast_dests > 0;
if (not multicast) {
num_mcast_dests = 1;
}

uint32_t dst_noc = cmd->write_linear.noc_xy_addr;
uint32_t write_offset_index = cmd->write_linear.write_offset_index;
uint32_t dst_addr = cmd->write_linear.addr + write_offset[write_offset_index];
uint32_t length = cmd->write_linear.length;
uint32_t data_ptr = cmd_ptr + sizeof(CQDispatchCmd);
cq_noc_async_write_init_state<CQ_NOC_sNdl, multicast>(0, get_noc_addr_helper(dst_noc, dst_addr));
if (multicast) {
cq_noc_async_write_init_state<CQ_NOC_sNdl, true>(0, get_noc_addr_helper(dst_noc, dst_addr));
} else {
cq_noc_async_write_init_state<CQ_NOC_sNdl, false>(0, get_noc_addr_helper(dst_noc, dst_addr));
}

while (length != 0) {
// More data needs to be written, but we've exhausted the CB. Acquire more pages.
@@ -434,11 +441,7 @@
uint32_t available_data = cb_fence - data_ptr;
uint32_t xfer_size = length > available_data ? available_data : length;

if constexpr (multicast) {
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_mcast_dests);
} else {
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size);
}
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_mcast_dests);
// Increment counters based on the number of packets that were written
uint32_t num_noc_packets_written = div_up(xfer_size, NOC_MAX_BURST_SIZE);
noc_nonposted_writes_num_issued[noc_index] += num_noc_packets_written;
@@ -454,11 +457,7 @@ void process_write(uint32_t& block_noc_writes_to_clear, uint32_t block_next_start_addr[]) {
void process_write(uint32_t& block_noc_writes_to_clear, uint32_t block_next_start_addr[]) {
volatile tt_l1_ptr CQDispatchCmd* cmd = (volatile tt_l1_ptr CQDispatchCmd*)cmd_ptr;
uint32_t num_mcast_dests = cmd->write_linear.num_mcast_dests;
if (num_mcast_dests == 0) {
process_write_linear<false>(1, block_noc_writes_to_clear, block_next_start_addr);
} else {
process_write_linear<true>(num_mcast_dests, block_noc_writes_to_clear, block_next_start_addr);
}
process_write_linear(num_mcast_dests, block_noc_writes_to_clear, block_next_start_addr);
}

template <bool is_dram>
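In the cq_dispatch change above, process_write_linear drops its template <bool multicast> parameter: the multicast decision is now derived at runtime from num_mcast_dests, only the init-state call still branches to pick a compile-time constant, and a single call path handles the write itself. A minimal standalone sketch of the same refactor, with illustrative names in place of the real dispatch-kernel helpers:

// Sketch: replacing a compile-time bool template parameter with a runtime flag,
// while still branching to templated helpers where a constant is required.
// Function names here are illustrative stand-ins, not the real kernel API.
#include <cstdint>
#include <cstdio>

template <bool multicast>
void init_write_state(uint64_t dst) {
    // Stand-in for the templated init-state call, which needs a compile-time flag.
    std::printf("init state (multicast=%d) dst=0x%llx\n", multicast ? 1 : 0, (unsigned long long)dst);
}

void write_any_len(uint32_t src, uint64_t dst, uint32_t len, uint32_t num_dests) {
    // Stand-in for the any-length write; num_dests is 1 for unicast.
    std::printf("write %u bytes from 0x%x to 0x%llx for %u dest(s)\n", len, src, (unsigned long long)dst, num_dests);
}

void process_write_linear(uint32_t num_mcast_dests, uint32_t src, uint64_t dst, uint32_t len) {
    bool multicast = num_mcast_dests > 0;  // decision moved from template argument to runtime
    if (!multicast) {
        num_mcast_dests = 1;               // unicast behaves like a single destination
    }
    if (multicast) {
        init_write_state<true>(dst);       // templated helper still needs a constant
    } else {
        init_write_state<false>(dst);
    }
    write_any_len(src, dst, len, num_mcast_dests);  // one call path for both cases
}

int main() {
    process_write_linear(0, 0x1000, 0x2000, 64);  // unicast
    process_write_linear(4, 0x1000, 0x2000, 64);  // multicast to 4 destinations
    return 0;
}

The likely effect is a single compiled copy of process_write_linear rather than two template instantiations, at the cost of one runtime branch per command around the init call.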