diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt index 972bc721995..31e3648d336 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt @@ -18,6 +18,7 @@ set(PERF_MICROBENCH_TESTS_SRCS routing/test_vc_loopback_tunnel.cpp routing/test_tt_fabric_sanity.cpp routing/test_tt_fabric_multi_hop_sanity.cpp + routing/test_tt_fabric_socket_sanity.cpp noc/test_noc_unicast_vs_multicast_to_single_core_latency.cpp old/matmul/matmul_global_l1.cpp old/matmul/matmul_local_l1.cpp diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp new file mode 100644 index 00000000000..99330aa8047 --- /dev/null +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp @@ -0,0 +1,154 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// clang-format off +#include "debug/dprint.h" +#include "dataflow_api.h" +#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/tt_fabric_api.h" +// clang-format on + +using namespace tt::tt_fabric; + +// seed to re-generate the data and validate against incoming data +constexpr uint32_t prng_seed = get_compile_time_arg_val(0); + +// total data/payload expected +constexpr uint32_t total_data_kb = get_compile_time_arg_val(1); +constexpr uint64_t total_data_words = ((uint64_t)total_data_kb) * 1024 / PACKET_WORD_SIZE_BYTES; + +// max packet size to generate mask +constexpr uint32_t max_packet_size_words = get_compile_time_arg_val(2); +static_assert(max_packet_size_words > 3, "max_packet_size_words must be greater than 3"); + +// fabric command +constexpr uint32_t test_command = get_compile_time_arg_val(3); + +// address to start reading from/poll on +constexpr uint32_t target_address = get_compile_time_arg_val(4); + +// atomic increment for the ATOMIC_INC command +constexpr uint32_t atomic_increment = get_compile_time_arg_val(5); + +constexpr uint32_t test_results_addr_arg = get_compile_time_arg_val(6); +constexpr uint32_t test_results_size_bytes = get_compile_time_arg_val(7); +constexpr uint32_t gk_interface_addr_l = get_compile_time_arg_val(8); +constexpr uint32_t gk_interface_addr_h = get_compile_time_arg_val(9); +constexpr uint32_t client_interface_addr = get_compile_time_arg_val(10); +constexpr uint32_t client_pull_req_buf_addr = get_compile_time_arg_val(11); +constexpr uint32_t data_buffer_start_addr = get_compile_time_arg_val(12); +constexpr uint32_t data_buffer_size_words = get_compile_time_arg_val(13); + +volatile tt_l1_ptr chan_req_buf* client_pull_req_buf = + reinterpret_cast(client_pull_req_buf_addr); +volatile fabric_client_interface_t* client_interface = (volatile fabric_client_interface_t*)client_interface_addr; +uint64_t xy_local_addr; +socket_reader_state socket_reader; + +tt_l1_ptr uint32_t* const test_results = reinterpret_cast(test_results_addr_arg); + +#define PAYLOAD_MASK (0xFFFF0000) + +void kernel_main() { + uint64_t processed_packet_words = 0, num_packets = 0; + volatile tt_l1_ptr uint32_t* poll_addr; + uint32_t poll_val = 0; + bool async_wr_check_failed = false; + + // parse runtime args + uint32_t 
dest_device = get_arg_val<uint32_t>(0);
+
+    tt_fabric_init();
+
+    zero_l1_buf(test_results, test_results_size_bytes);
+    test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED;
+    test_results[PQ_TEST_MISC_INDEX] = 0xff000000;
+    zero_l1_buf(
+        reinterpret_cast<tt_l1_ptr uint32_t*>(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES);
+    test_results[PQ_TEST_MISC_INDEX] = 0xff000001;
+    zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_client_interface_t));
+    test_results[PQ_TEST_MISC_INDEX] = 0xff000002;
+    zero_l1_buf((uint32_t*)client_pull_req_buf, sizeof(chan_req_buf));
+    test_results[PQ_TEST_MISC_INDEX] = 0xff000003;
+
+    client_interface->gk_interface_addr = ((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l;
+    client_interface->gk_msg_buf_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, gk_msg_buf);
+    client_interface->pull_req_buf_addr = xy_local_addr | client_pull_req_buf_addr;
+    test_results[PQ_TEST_MISC_INDEX] = 0xff000004;
+
+    // make sure fabric node gatekeeper is available.
+    fabric_endpoint_init();
+
+    socket_reader.init(data_buffer_start_addr, data_buffer_size_words);
+    DPRINT << "Socket open on " << dest_device << ENDL();
+    test_results[PQ_TEST_MISC_INDEX] = 0xff000005;
+
+    fabric_socket_open(
+        3,                      // the network plane to use for this socket
+        2,                      // Temporal epoch for which the socket is being opened
+        1,                      // Socket Id to open
+        SOCKET_TYPE_DGRAM,      // Unicast, Multicast, SSocket, DSocket
+        SOCKET_DIRECTION_RECV,  // Send or Receive
+        dest_device >> 16,      // Remote mesh/device that is the socket data sender/receiver.
+        dest_device & 0xFFFF,
+        0                       // fabric virtual channel.
+    );
+    test_results[PQ_TEST_MISC_INDEX] = 0xff000006;
+
+    uint32_t loop_count = 0;
+    uint32_t packet_count = 0;
+    while (1) {
+        if (!fvc_req_buf_is_empty(client_pull_req_buf) && fvc_req_valid(client_pull_req_buf)) {
+            uint32_t req_index = client_pull_req_buf->rdptr.ptr & CHAN_REQ_BUF_SIZE_MASK;
+            chan_request_entry_t* req = (chan_request_entry_t*)client_pull_req_buf->chan_req + req_index;
+            pull_request_t* pull_req = &req->pull_request;
+            if (socket_reader.packet_in_progress == 0) {
+                DPRINT << "Socket Packet " << packet_count << ENDL();
+            }
+            if (pull_req->flags == FORWARD) {
+                socket_reader.pull_socket_data(pull_req);
+                test_results[PQ_TEST_MISC_INDEX] = 0xDD000001;
+                noc_async_read_barrier();
+                update_pull_request_words_cleared(pull_req);
+                socket_reader.pull_words_in_flight = 0;
+                socket_reader.push_socket_data();
+            }
+
+            if (socket_reader.packet_in_progress == 1 and socket_reader.packet_words_remaining == 0) {
+                // wait for any pending socket data writes to finish.
+                test_results[PQ_TEST_MISC_INDEX] = 0xDD000002;
+
+                noc_async_write_barrier();
+
+                test_results[PQ_TEST_MISC_INDEX] = 0xDD000003;
+                // clear the flags field to invalidate pull request slot.
+                // flags will be set to non-zero by next requestor.
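+                // advancing rdptr below is what frees the slot for the next pull request;
+                // the index wraps via CHAN_REQ_BUF_SIZE_MASK as above.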
+ req_buf_advance_rdptr((chan_req_buf*)client_pull_req_buf); + socket_reader.packet_in_progress = 0; + packet_count++; + loop_count = 0; + } + } + test_results[PQ_TEST_MISC_INDEX] = 0xDD400000 | (loop_count & 0xfffff); + + loop_count++; + if (packet_count > 0 and loop_count >= 0x10000) { + DPRINT << "Socket Rx Finished" << packet_count << ENDL(); + break; + } + } + + // write out results + set_64b_result(test_results, processed_packet_words, PQ_TEST_WORD_CNT_INDEX); + set_64b_result(test_results, num_packets, TX_TEST_IDX_NPKT); + + if (async_wr_check_failed) { + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_DATA_MISMATCH; + } else { + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS; + test_results[PQ_TEST_MISC_INDEX] = 0xff000005; + } +} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp index be7520fd4b7..772f78bf6b4 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx.cpp @@ -8,8 +8,11 @@ #include "tt_fabric/hw/inc/tt_fabric.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp" #include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/tt_fabric_api.h" // clang-format on +using namespace tt::tt_fabric; + uint32_t src_endpoint_id; // constexpr uint32_t src_endpoint_id = get_compile_time_arg_val(0); constexpr uint32_t num_dest_endpoints = get_compile_time_arg_val(1); @@ -20,50 +23,51 @@ constexpr uint32_t data_buffer_start_addr = get_compile_time_arg_val(3); constexpr uint32_t data_buffer_size_words = get_compile_time_arg_val(4); constexpr uint32_t routing_table_start_addr = get_compile_time_arg_val(5); -// constexpr uint32_t router_x = get_compile_time_arg_val(6); -// constexpr uint32_t router_y = get_compile_time_arg_val(7); -constexpr uint32_t test_results_addr_arg = get_compile_time_arg_val(8); -constexpr uint32_t test_results_size_bytes = get_compile_time_arg_val(9); +constexpr uint32_t test_results_addr_arg = get_compile_time_arg_val(6); +constexpr uint32_t test_results_size_bytes = get_compile_time_arg_val(7); tt_l1_ptr uint32_t* const test_results = reinterpret_cast(test_results_addr_arg); -constexpr uint32_t prng_seed = get_compile_time_arg_val(10); +constexpr uint32_t prng_seed = get_compile_time_arg_val(8); -constexpr uint32_t total_data_kb = get_compile_time_arg_val(11); +constexpr uint32_t total_data_kb = get_compile_time_arg_val(9); constexpr uint64_t total_data_words = ((uint64_t)total_data_kb) * 1024 / PACKET_WORD_SIZE_BYTES; -constexpr uint32_t max_packet_size_words = get_compile_time_arg_val(12); +constexpr uint32_t max_packet_size_words = get_compile_time_arg_val(10); static_assert(max_packet_size_words > 3, "max_packet_size_words must be greater than 3"); -constexpr uint32_t timeout_cycles = get_compile_time_arg_val(13); +constexpr uint32_t timeout_cycles = get_compile_time_arg_val(11); -constexpr bool skip_pkt_content_gen = get_compile_time_arg_val(14); +constexpr bool skip_pkt_content_gen = get_compile_time_arg_val(12); constexpr pkt_dest_size_choices_t pkt_dest_size_choice = - static_cast(get_compile_time_arg_val(15)); + static_cast(get_compile_time_arg_val(13)); -constexpr uint32_t data_sent_per_iter_low = get_compile_time_arg_val(16); -constexpr uint32_t data_sent_per_iter_high = 
get_compile_time_arg_val(17); -constexpr uint32_t test_command = get_compile_time_arg_val(18); +constexpr uint32_t data_sent_per_iter_low = get_compile_time_arg_val(14); +constexpr uint32_t data_sent_per_iter_high = get_compile_time_arg_val(15); +constexpr uint32_t test_command = get_compile_time_arg_val(16); -uint32_t base_target_address = get_compile_time_arg_val(19); +uint32_t base_target_address = get_compile_time_arg_val(17); // atomic increment for the ATOMIC_INC command -constexpr uint32_t atomic_increment = get_compile_time_arg_val(20); +constexpr uint32_t atomic_increment = get_compile_time_arg_val(18); // constexpr uint32_t dest_device = get_compile_time_arg_val(21); uint32_t dest_device; -constexpr uint32_t signal_address = get_compile_time_arg_val(21); +constexpr uint32_t signal_address = get_compile_time_arg_val(19); +constexpr uint32_t client_interface_addr = get_compile_time_arg_val(20); uint32_t max_packet_size_mask; auto input_queue_state = select_input_queue(); volatile local_pull_request_t *local_pull_request = (volatile local_pull_request_t *)(data_buffer_start_addr - 1024); -tt_l1_ptr volatile tt::tt_fabric::fabric_router_l1_config_t* routing_table = - reinterpret_cast(routing_table_start_addr); +volatile tt_l1_ptr fabric_router_l1_config_t* routing_table = + reinterpret_cast(routing_table_start_addr); +volatile fabric_client_interface_t* client_interface = (volatile fabric_client_interface_t*)client_interface_addr; fvc_producer_state_t test_producer __attribute__((aligned(16))); +fvcc_inbound_state_t fvcc_test_producer __attribute__((aligned(16))); uint64_t xy_local_addr; @@ -73,6 +77,9 @@ uint32_t target_address; uint32_t noc_offset; uint32_t rx_addr_hi; +uint32_t gk_interface_addr_l; +uint32_t gk_interface_addr_h; + // generates packets with random size and payload on the input side inline bool test_buffer_handler_async_wr() { if (input_queue_state.all_packets_done()) { @@ -243,35 +250,108 @@ inline bool test_buffer_handler_atomic_inc() { return false; } +inline bool test_buffer_handler_fvcc() { + if (input_queue_state.all_packets_done()) { + return true; + } + + uint32_t free_words = fvcc_test_producer.get_num_msgs_free() * PACKET_HEADER_SIZE_WORDS; + if (free_words < PACKET_HEADER_SIZE_WORDS) { + return false; + } + + uint32_t byte_wr_addr = fvcc_test_producer.get_local_buffer_write_addr(); + uint32_t words_to_init = std::min(free_words, fvcc_test_producer.words_before_local_buffer_wrap()); + uint32_t words_initialized = 0; + while (words_initialized < words_to_init) { + if (input_queue_state.all_packets_done()) { + break; + } + + if (!input_queue_state.packet_active()) { // start of a new packet + input_queue_state.next_inline_packet(total_data_words); + + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + + packet_header.routing.flags = SYNC; + packet_header.routing.dst_mesh_id = dest_device >> 16; + packet_header.routing.dst_dev_id = dest_device & 0xFFFF; + packet_header.routing.src_dev_id = routing_table->my_device_id; + packet_header.routing.src_mesh_id = routing_table->my_mesh_id; + packet_header.routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; + packet_header.session.command = ASYNC_WR_RESP; + packet_header.session.target_offset_l = target_address; + packet_header.session.target_offset_h = noc_offset; + packet_header.packet_parameters.misc_parameters.words[1] = 0; + packet_header.packet_parameters.misc_parameters.words[2] = 0; + tt_fabric_add_header_checksum(&packet_header); + uint32_t words_left = words_to_init - words_initialized; + 
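+            // the header may straddle the end of the local buffer: copy the words that fit now
+            // and finish the header on the next call.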
bool split_header = words_left < PACKET_HEADER_SIZE_WORDS; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS; + if (split_header) { + header_words_to_init = words_left; + } + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i]; + } + + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } else { + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + uint32_t header_words_initialized = + input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS - header_words_initialized; + uint32_t header_dword_index = header_words_initialized * PACKET_WORD_SIZE_BYTES / 4; + uint32_t words_left = words_to_init - words_initialized; + header_words_to_init = std::min(words_left, header_words_to_init); + + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i + header_dword_index]; + } + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } + } + fvcc_test_producer.advance_local_wrptr(words_initialized / PACKET_HEADER_SIZE_WORDS); + return false; +} + bool test_buffer_handler() { if constexpr (test_command == ASYNC_WR) { return test_buffer_handler_async_wr(); } else if constexpr (test_command == ATOMIC_INC) { return test_buffer_handler_atomic_inc(); + } else if constexpr (test_command == ASYNC_WR_RESP) { + return test_buffer_handler_fvcc(); } } void kernel_main() { tt_fabric_init(); + uint32_t rt_args_idx = 0; // TODO: refactor - src_endpoint_id = get_arg_val(0); - noc_offset = get_arg_val(1); - uint32_t router_x = get_arg_val(2); - uint32_t router_y = get_arg_val(3); - dest_device = get_arg_val(4); - uint32_t rx_buf_size = get_arg_val(5); + src_endpoint_id = get_arg_val(rt_args_idx++); + noc_offset = get_arg_val(rt_args_idx++); + uint32_t router_x = get_arg_val(rt_args_idx++); + uint32_t router_y = get_arg_val(rt_args_idx++); + dest_device = get_arg_val(rt_args_idx++); + uint32_t rx_buf_size = get_arg_val(rt_args_idx++); + gk_interface_addr_l = get_arg_val(rt_args_idx++); + gk_interface_addr_h = get_arg_val(rt_args_idx++); if (ASYNC_WR == test_command) { - base_target_address = get_arg_val(6); + base_target_address = get_arg_val(rt_args_idx++); } target_address = base_target_address; rx_addr_hi = base_target_address + rx_buf_size; uint64_t router_config_addr = NOC_XY_ADDR(NOC_X(router_x), NOC_Y(router_y), eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); - noc_async_read_one_packet( - router_config_addr, routing_table_start_addr, sizeof(tt::tt_fabric::fabric_router_l1_config_t)); + noc_async_read_one_packet(router_config_addr, routing_table_start_addr, sizeof(fabric_router_l1_config_t)); noc_async_read_barrier(); zero_l1_buf(test_results, test_results_size_bytes); @@ -284,6 +364,9 @@ void kernel_main() { zero_l1_buf(reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); zero_l1_buf((uint32_t*)local_pull_request, sizeof(local_pull_request_t)); zero_l1_buf((uint32_t*)&packet_header, sizeof(packet_header_t)); + zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_client_interface_t)); + client_interface->gk_msg_buf_addr = + 
(((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l) + offsetof(gatekeeper_info_t, gk_msg_buf); if constexpr (pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM) { input_queue_state.init(src_endpoint_id, prng_seed); @@ -294,6 +377,7 @@ void kernel_main() { } test_producer.init(data_buffer_start_addr, data_buffer_size_words, 0x0); + fvcc_test_producer.init(data_buffer_start_addr, 0x0, 0x0); uint32_t temp = max_packet_size_words; max_packet_size_mask = 0; @@ -364,6 +448,15 @@ void kernel_main() { DPRINT << "Packet Header Corrupted: packet " << packet_count << " Addr: " << test_producer.get_local_buffer_read_addr() << ENDL(); break; + } else if (fvcc_test_producer.get_curr_packet_valid()) { + fvcc_test_producer.fvcc_handler(); +#ifdef CHECK_TIMEOUT + progress_timestamp = get_timestamp_32b(); +#endif + } else if (fvcc_test_producer.packet_corrupted) { + DPRINT << "Packet Header Corrupted: packet " << packet_count + << " Addr: " << fvcc_test_producer.get_local_buffer_read_addr() << ENDL(); + break; } else if (all_packets_initialized) { DPRINT << "all packets done" << ENDL(); break; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp new file mode 100644 index 00000000000..39571c2a5e4 --- /dev/null +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp @@ -0,0 +1,507 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// clang-format off +#include "dataflow_api.h" +#include "debug/dprint.h" +#include "tt_fabric/hw/inc/tt_fabric.h" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen.hpp" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/tt_fabric_api.h" +// clang-format on + +using namespace tt::tt_fabric; + +uint32_t src_endpoint_id; +// constexpr uint32_t src_endpoint_id = get_compile_time_arg_val(0); +constexpr uint32_t num_dest_endpoints = get_compile_time_arg_val(1); +static_assert(is_power_of_2(num_dest_endpoints), "num_dest_endpoints must be a power of 2"); +constexpr uint32_t dest_endpoint_start_id = get_compile_time_arg_val(2); + +constexpr uint32_t data_buffer_start_addr = get_compile_time_arg_val(3); +constexpr uint32_t data_buffer_size_words = get_compile_time_arg_val(4); + +constexpr uint32_t routing_table_start_addr = get_compile_time_arg_val(5); +constexpr uint32_t gk_interface_addr_l = get_compile_time_arg_val(6); +constexpr uint32_t gk_interface_addr_h = get_compile_time_arg_val(7); + +constexpr uint32_t test_results_addr_arg = get_compile_time_arg_val(8); +constexpr uint32_t test_results_size_bytes = get_compile_time_arg_val(9); + +tt_l1_ptr uint32_t* const test_results = reinterpret_cast(test_results_addr_arg); + +constexpr uint32_t prng_seed = get_compile_time_arg_val(10); + +constexpr uint32_t total_data_kb = get_compile_time_arg_val(11); +constexpr uint64_t total_data_words = ((uint64_t)total_data_kb) * 1024 / PACKET_WORD_SIZE_BYTES; + +constexpr uint32_t max_packet_size_words = get_compile_time_arg_val(12); + +static_assert(max_packet_size_words > 3, "max_packet_size_words must be greater than 3"); + +constexpr uint32_t timeout_cycles = get_compile_time_arg_val(13); + +constexpr bool skip_pkt_content_gen = get_compile_time_arg_val(14); +constexpr pkt_dest_size_choices_t pkt_dest_size_choice = + static_cast(get_compile_time_arg_val(15)); + 
+constexpr uint32_t data_sent_per_iter_low = get_compile_time_arg_val(16);
+constexpr uint32_t data_sent_per_iter_high = get_compile_time_arg_val(17);
+constexpr uint32_t test_command = get_compile_time_arg_val(18);
+
+constexpr uint32_t base_target_address = get_compile_time_arg_val(19);
+uint32_t target_address = base_target_address;
+
+// atomic increment for the ATOMIC_INC command
+constexpr uint32_t atomic_increment = get_compile_time_arg_val(20);
+// constexpr uint32_t dest_device = get_compile_time_arg_val(21);
+uint32_t dest_device;
+
+constexpr uint32_t signal_address = get_compile_time_arg_val(21);
+constexpr uint32_t client_interface_addr = get_compile_time_arg_val(22);
+constexpr uint32_t client_pull_req_buf_addr = get_compile_time_arg_val(23);
+
+uint32_t max_packet_size_mask;
+
+auto input_queue_state = select_input_queue();
+volatile local_pull_request_t* local_pull_request = (volatile local_pull_request_t*)(data_buffer_start_addr - 1024);
+volatile tt_l1_ptr fabric_router_l1_config_t* routing_table =
+    reinterpret_cast<volatile tt_l1_ptr fabric_router_l1_config_t*>(routing_table_start_addr);
+volatile fabric_client_interface_t* client_interface = (volatile fabric_client_interface_t*)client_interface_addr;
+volatile tt_l1_ptr chan_req_buf* client_pull_req_buf =
+    reinterpret_cast<volatile tt_l1_ptr chan_req_buf*>(client_pull_req_buf_addr);
+
+fvc_producer_state_t test_producer __attribute__((aligned(16)));
+fvcc_inbound_state_t fvcc_test_producer __attribute__((aligned(16)));
+
+uint64_t xy_local_addr;
+
+packet_header_t packet_header __attribute__((aligned(16)));
+
+uint32_t noc_offset;
+
+// generates packets with random size and payload on the input side
+inline bool test_buffer_handler_dsocket_wr(socket_handle_t* socket_handle) {
+    if (input_queue_state.all_packets_done()) {
+        return true;
+    }
+
+    uint32_t free_words = test_producer.get_num_words_free();
+    if (free_words < PACKET_HEADER_SIZE_WORDS) {
+        return false;
+    }
+
+    // Each call to test_buffer_handler initializes only up to the end
+    // of the producer buffer. Since the header is 3 words, we need to handle
+    // split-header cases, where the buffer does not have enough space for the full header.
+    // In that case, we write as many words as fit, and the remaining header words
+    // are written on the next call.
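+    // Example: with 1 word left before the wrap, one header word is written now;
+    // the next call writes the remaining 2 header words, tracked via curr_packet_words_remaining.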
+ uint32_t byte_wr_addr = test_producer.get_local_buffer_write_addr(); + uint32_t words_to_init = std::min(free_words, test_producer.words_before_local_buffer_wrap()); + uint32_t words_initialized = 0; + while (words_initialized < words_to_init) { + if (input_queue_state.all_packets_done()) { + break; + } + + if (!input_queue_state.packet_active()) { // start of a new packet + input_queue_state.next_packet( + num_dest_endpoints, + dest_endpoint_start_id, + max_packet_size_words, + max_packet_size_mask, + total_data_words); + + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + + packet_header.routing.flags = FORWARD; + packet_header.routing.packet_size_bytes = input_queue_state.curr_packet_size_words * PACKET_WORD_SIZE_BYTES; + packet_header.routing.dst_mesh_id = dest_device >> 16; + packet_header.routing.dst_dev_id = dest_device & 0xFFFF; + packet_header.session.command = DSOCKET_WR; + packet_header.session.target_offset_l = (uint32_t)socket_handle->pull_notification_adddr; + packet_header.session.target_offset_h = socket_handle->pull_notification_adddr >> 32; + target_address += packet_header.routing.packet_size_bytes - PACKET_HEADER_SIZE_BYTES; + packet_header.packet_parameters.misc_parameters.words[1] = input_queue_state.packet_rnd_seed; + tt_fabric_add_header_checksum(&packet_header); + uint32_t words_left = words_to_init - words_initialized; + bool split_header = words_left < PACKET_HEADER_SIZE_WORDS; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS; + if (split_header) { + header_words_to_init = words_left; + } + + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i]; + } + + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } else { + uint32_t words_remaining = words_to_init - words_initialized; + uint32_t packet_words_initialized = + input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining; + if (packet_words_initialized < PACKET_HEADER_SIZE_WORDS) { + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + uint32_t header_words_initialized = packet_words_initialized; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS - header_words_initialized; + uint32_t header_dword_index = header_words_initialized * PACKET_WORD_SIZE_BYTES / 4; + header_words_to_init = std::min(words_remaining, header_words_to_init); + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i + header_dword_index]; + } + words_initialized += header_words_to_init; + words_remaining = words_to_init - words_initialized; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + if (words_remaining == 0) { + // no space left for packet data. 
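+                    // the remaining payload words are generated on the next call.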
+ break; + } + } + + uint32_t num_words = std::min(words_remaining, input_queue_state.curr_packet_words_remaining); + if constexpr (!skip_pkt_content_gen) { + uint32_t start_val = (input_queue_state.packet_rnd_seed & 0xFFFF0000) + + (input_queue_state.curr_packet_size_words - + input_queue_state.curr_packet_words_remaining - PACKET_HEADER_SIZE_WORDS); + fill_packet_data(reinterpret_cast(byte_wr_addr), num_words, start_val); + } + words_initialized += num_words; + input_queue_state.curr_packet_words_remaining -= num_words; + byte_wr_addr += num_words * PACKET_WORD_SIZE_BYTES; + } + } + test_producer.advance_local_wrptr(words_initialized); + return false; +} + +// generates packets with random size and payload on the input side +inline bool test_buffer_handler_atomic_inc() { + if (input_queue_state.all_packets_done()) { + return true; + } + + uint32_t free_words = test_producer.get_num_words_free(); + if (free_words < PACKET_HEADER_SIZE_WORDS) { + return false; + } + + uint32_t byte_wr_addr = test_producer.get_local_buffer_write_addr(); + uint32_t words_to_init = std::min(free_words, test_producer.words_before_local_buffer_wrap()); + uint32_t words_initialized = 0; + while (words_initialized < words_to_init) { + if (input_queue_state.all_packets_done()) { + break; + } + + if (!input_queue_state.packet_active()) { // start of a new packet + input_queue_state.next_inline_packet(total_data_words); + + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + + packet_header.routing.flags = INLINE_FORWARD; + packet_header.routing.dst_mesh_id = dest_device >> 16; + packet_header.routing.dst_dev_id = dest_device & 0xFFFF; + packet_header.routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; + packet_header.session.command = ATOMIC_INC; + packet_header.session.target_offset_l = target_address; + packet_header.session.target_offset_h = noc_offset; + packet_header.packet_parameters.atomic_parameters.wrap_boundary = 31; + packet_header.packet_parameters.atomic_parameters.increment = atomic_increment; + tt_fabric_add_header_checksum(&packet_header); + uint32_t words_left = words_to_init - words_initialized; + bool split_header = words_left < PACKET_HEADER_SIZE_WORDS; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS; + if (split_header) { + header_words_to_init = words_left; + } + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i]; + } + + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } else { + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + uint32_t header_words_initialized = + input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS - header_words_initialized; + uint32_t header_dword_index = header_words_initialized * PACKET_WORD_SIZE_BYTES / 4; + uint32_t words_left = words_to_init - words_initialized; + header_words_to_init = std::min(words_left, header_words_to_init); + + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i + header_dword_index]; + } + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } + } + 
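+    // publish everything initialized in this pass to the consumer with a single write-pointer update.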
test_producer.advance_local_wrptr(words_initialized); + return false; +} + +inline bool test_buffer_handler_fvcc() { + if (input_queue_state.all_packets_done()) { + return true; + } + + uint32_t free_words = fvcc_test_producer.get_num_msgs_free() * PACKET_HEADER_SIZE_WORDS; + if (free_words < PACKET_HEADER_SIZE_WORDS) { + return false; + } + + uint32_t byte_wr_addr = fvcc_test_producer.get_local_buffer_write_addr(); + uint32_t words_to_init = std::min(free_words, fvcc_test_producer.words_before_local_buffer_wrap()); + uint32_t words_initialized = 0; + while (words_initialized < words_to_init) { + if (input_queue_state.all_packets_done()) { + break; + } + + if (!input_queue_state.packet_active()) { // start of a new packet + input_queue_state.next_inline_packet(total_data_words); + + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + + packet_header.routing.flags = SYNC; + packet_header.routing.dst_mesh_id = dest_device >> 16; + packet_header.routing.dst_dev_id = dest_device & 0xFFFF; + packet_header.routing.src_dev_id = routing_table->my_device_id; + packet_header.routing.src_mesh_id = routing_table->my_mesh_id; + packet_header.routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; + packet_header.session.command = ASYNC_WR_RESP; + packet_header.session.target_offset_l = target_address; + packet_header.session.target_offset_h = noc_offset; + packet_header.packet_parameters.misc_parameters.words[1] = 0; + packet_header.packet_parameters.misc_parameters.words[2] = 0; + + tt_fabric_add_header_checksum(&packet_header); + uint32_t words_left = words_to_init - words_initialized; + bool split_header = words_left < PACKET_HEADER_SIZE_WORDS; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS; + if (split_header) { + header_words_to_init = words_left; + } + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i]; + } + + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } else { + tt_l1_ptr uint32_t* header_ptr = reinterpret_cast(byte_wr_addr); + uint32_t header_words_initialized = + input_queue_state.curr_packet_size_words - input_queue_state.curr_packet_words_remaining; + uint32_t header_words_to_init = PACKET_HEADER_SIZE_WORDS - header_words_initialized; + uint32_t header_dword_index = header_words_initialized * PACKET_WORD_SIZE_BYTES / 4; + uint32_t words_left = words_to_init - words_initialized; + header_words_to_init = std::min(words_left, header_words_to_init); + + for (uint32_t i = 0; i < (header_words_to_init * PACKET_WORD_SIZE_BYTES / 4); i++) { + header_ptr[i] = ((uint32_t*)&packet_header)[i + header_dword_index]; + } + words_initialized += header_words_to_init; + input_queue_state.curr_packet_words_remaining -= header_words_to_init; + byte_wr_addr += header_words_to_init * PACKET_WORD_SIZE_BYTES; + } + } + fvcc_test_producer.advance_local_wrptr(words_initialized / PACKET_HEADER_SIZE_WORDS); + return false; +} + +bool test_buffer_handler(socket_handle_t* socket_handle) { + if constexpr (test_command == DSOCKET_WR) { + return test_buffer_handler_dsocket_wr(socket_handle); + } else if constexpr (test_command == ATOMIC_INC) { + return test_buffer_handler_atomic_inc(); + } else if constexpr (test_command == SOCKET_OPEN) { + return test_buffer_handler_fvcc(); + } + return true; +} + +void kernel_main() { + tt_fabric_init(); + + // TODO: refactor + src_endpoint_id 
= get_arg_val(0); + noc_offset = get_arg_val(1); + uint32_t router_x = get_arg_val(2); + uint32_t router_y = get_arg_val(3); + dest_device = get_arg_val(4); + + if (ASYNC_WR == test_command) { + target_address = get_arg_val(5); + } + + uint64_t router_config_addr = NOC_XY_ADDR(router_x, router_y, eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); + noc_async_read_one_packet( + router_config_addr, routing_table_start_addr, sizeof(tt::tt_fabric::fabric_router_l1_config_t)); + noc_async_read_barrier(); + + zero_l1_buf(test_results, test_results_size_bytes); + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; + test_results[PQ_TEST_STATUS_INDEX + 1] = (uint32_t)local_pull_request; + + test_results[PQ_TEST_MISC_INDEX] = 0xff000000; + test_results[PQ_TEST_MISC_INDEX + 1] = 0xcc000000 | src_endpoint_id; + + zero_l1_buf( + reinterpret_cast(data_buffer_start_addr), data_buffer_size_words * PACKET_WORD_SIZE_BYTES); + zero_l1_buf((uint32_t*)local_pull_request, sizeof(local_pull_request_t)); + zero_l1_buf((uint32_t*)&packet_header, sizeof(packet_header_t)); + zero_l1_buf((uint32_t*)client_interface, sizeof(fabric_client_interface_t)); + zero_l1_buf((uint32_t*)client_pull_req_buf, sizeof(chan_req_buf)); + client_interface->gk_interface_addr = ((uint64_t)gk_interface_addr_h << 32) | gk_interface_addr_l; + client_interface->gk_msg_buf_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, gk_msg_buf); + client_interface->pull_req_buf_addr = xy_local_addr | client_pull_req_buf_addr; + + // make sure fabric node gatekeeper is available. + fabric_endpoint_init(); + + if constexpr (pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM) { + input_queue_state.init(src_endpoint_id, prng_seed); + } else if constexpr (pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE) { + input_queue_state.init(max_packet_size_words, 0); + } else { + input_queue_state.init(src_endpoint_id, prng_seed); + } + + test_producer.init(data_buffer_start_addr, data_buffer_size_words, 0x0); + fvcc_test_producer.init(data_buffer_start_addr, 0x0, 0x0); + + uint32_t temp = max_packet_size_words; + max_packet_size_mask = 0; + temp >>= 1; + while (temp) { + max_packet_size_mask = (max_packet_size_mask << 1) + 1; + temp >>= 1; + } + if ((max_packet_size_mask + 1) != max_packet_size_words) { + // max_packet_size_words is not a power of 2 + // snap to next power of 2 mask + max_packet_size_mask = (max_packet_size_mask << 1) + 1; + } + + // wait till test sends start signal. This is set by test + // once tt_fabric kernels have been launched on all the test devices. + while (*(volatile tt_l1_ptr uint32_t*)signal_address == 0); + + test_results[PQ_TEST_MISC_INDEX] = 0xff000001; + + uint64_t data_words_sent = 0; + uint64_t iter = 0; + uint64_t zero_data_sent_iter = 0; + uint64_t few_data_sent_iter = 0; + uint64_t many_data_sent_iter = 0; + uint64_t words_flushed = 0; + bool timeout = false; + uint64_t start_timestamp = get_timestamp(); + uint32_t progress_timestamp = start_timestamp & 0xFFFFFFFF; + + uint32_t curr_packet_size = 0; + uint32_t curr_packet_words_sent = 0; + uint32_t packet_count = 0; + + socket_handle_t* socket_handle = fabric_socket_open( + 3, // the network plane to use for this socket + 2, // Temporal epoch for which the socket is being opened + 1, // Socket Id to open + SOCKET_TYPE_DGRAM, // Unicast, Multicast, SSocket, DSocket + SOCKET_DIRECTION_SEND, // Send or Receive + dest_device >> 16, // Remote mesh/device that is the socket data sender/receiver. 
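+        // dest_device packs the remote mesh id in the upper 16 bits and the device id in the lower 16 bits.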
+ dest_device & 0xFFFF, + 0 // fabric virtual channel. + ); + + fabric_socket_connect(socket_handle); + DPRINT << "Socket 1 connected. Handle = " << (uint32_t)socket_handle << ENDL(); + + while (true) { + iter++; +#ifdef CHECK_TIMEOUT + if (timeout_cycles > 0) { + uint32_t cycles_since_progress = get_timestamp_32b() - progress_timestamp; + if (cycles_since_progress > timeout_cycles) { + timeout = true; + break; + } + } +#endif + + bool all_packets_initialized = test_buffer_handler(socket_handle); + + if (test_producer.get_curr_packet_valid()) { + curr_packet_size = + (test_producer.current_packet_header.routing.packet_size_bytes + PACKET_WORD_SIZE_BYTES - 1) >> 4; + uint32_t curr_data_words_sent = test_producer.pull_data_from_fvc_buffer(); + curr_packet_words_sent += curr_data_words_sent; + data_words_sent += curr_data_words_sent; + if constexpr (!(data_sent_per_iter_low == 0 && data_sent_per_iter_high == 0)) { + zero_data_sent_iter += static_cast(curr_data_words_sent <= 0); + few_data_sent_iter += static_cast(curr_data_words_sent <= data_sent_per_iter_low); + many_data_sent_iter += static_cast(curr_data_words_sent >= data_sent_per_iter_high); + } +#ifdef CHECK_TIMEOUT + progress_timestamp = (curr_data_words_sent > 0) ? get_timestamp_32b() : progress_timestamp; +#endif + if (curr_packet_words_sent == curr_packet_size) { + curr_packet_words_sent = 0; + packet_count++; + } + } else if (test_producer.packet_corrupted) { + DPRINT << "Packet Header Corrupted: packet " << packet_count + << " Addr: " << test_producer.get_local_buffer_read_addr() << ENDL(); + break; + } else if (fvcc_test_producer.get_curr_packet_valid()) { + fvcc_test_producer.fvcc_handler(); +#ifdef CHECK_TIMEOUT + progress_timestamp = get_timestamp_32b(); +#endif + } else if (fvcc_test_producer.packet_corrupted) { + DPRINT << "Packet Header Corrupted: packet " << packet_count + << " Addr: " << fvcc_test_producer.get_local_buffer_read_addr() << ENDL(); + break; + } else if (all_packets_initialized) { + DPRINT << "all packets done" << ENDL(); + break; + } + } + + /* + fabric_socket_open(3, 4, 2, SOCKET_TYPE_DGRAM, SOCKET_DIRECTION_SEND, dest_device >> 16, dest_device & 0xFFFF, + 0); fabric_socket_open(3, 6, 3, SOCKET_TYPE_DGRAM, SOCKET_DIRECTION_SEND, dest_device >> 16, dest_device & + 0xFFFF, 0); fabric_socket_open(3, 8, 4, SOCKET_TYPE_DGRAM, SOCKET_DIRECTION_SEND, dest_device >> 16, dest_device + & 0xFFFF, 0); fabric_socket_open(3, 10, 5, SOCKET_TYPE_DGRAM, SOCKET_DIRECTION_SEND, dest_device >> 16, + dest_device & 0xFFFF, 0); + */ + uint64_t cycles_elapsed = get_timestamp() - start_timestamp; + + uint64_t num_packets = input_queue_state.get_num_packets(); + set_64b_result(test_results, data_words_sent, PQ_TEST_WORD_CNT_INDEX); + set_64b_result(test_results, cycles_elapsed, PQ_TEST_CYCLES_INDEX); + set_64b_result(test_results, iter, PQ_TEST_ITER_INDEX); + set_64b_result(test_results, total_data_words, TX_TEST_IDX_TOT_DATA_WORDS); + set_64b_result(test_results, num_packets, TX_TEST_IDX_NPKT); + set_64b_result(test_results, zero_data_sent_iter, TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER); + set_64b_result(test_results, few_data_sent_iter, TX_TEST_IDX_FEW_DATA_WORDS_SENT_ITER); + set_64b_result(test_results, many_data_sent_iter, TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER); + + if (test_producer.packet_corrupted) { + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_BAD_HEADER; + test_results[PQ_TEST_MISC_INDEX] = packet_count; + } else if (!timeout) { + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_PASS; + 
test_results[PQ_TEST_MISC_INDEX] = packet_count; + } else { + test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_TIMEOUT; + set_64b_result(test_results, words_flushed, TX_TEST_IDX_WORDS_FLUSHED); + } +} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp index 352a3d107eb..32c955f159e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp @@ -16,6 +16,7 @@ using std::vector; using namespace tt; +using namespace tt::tt_fabric; using json = nlohmann::json; int main(int argc, char** argv) { @@ -23,6 +24,8 @@ int main(int argc, char** argv) { constexpr uint32_t default_tx_y = 0; constexpr uint32_t default_rx_x = 0; constexpr uint32_t default_rx_y = 3; + constexpr uint32_t default_gk_x = 0; + constexpr uint32_t default_gk_y = 9; constexpr uint32_t default_mux_x = 0; constexpr uint32_t default_mux_y = 1; @@ -155,6 +158,8 @@ int main(int argc, char** argv) { uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y); uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x); uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y); + uint32_t gk_x = test_args::get_command_option_uint32(input_args, "--gk_x", default_gk_x); + uint32_t gk_y = test_args::get_command_option_uint32(input_args, "--gk_y", default_gk_y); uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed); uint32_t data_kb_per_tx = test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx); @@ -219,6 +224,8 @@ int main(int argc, char** argv) { bool pass = true; + CoreCoord gk_core = {gk_x, gk_y}; + std::map defines = { {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth }; @@ -289,6 +296,15 @@ int main(int argc, char** argv) { bool router_core_found = false; CoreCoord router_logical_core; CoreCoord router_phys_core; + CoreCoord gk_phys_core; + uint32_t routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + uint32_t gk_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; + uint32_t client_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; + uint32_t socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t); + log_info(LogTest, "GK Routing Table Addr = 0x{:08X}", routing_table_addr); + log_info(LogTest, "GK Info Addr = 0x{:08X}", gk_interface_addr); + log_info(LogTest, "GK Socket Info Addr = 0x{:08X}", socket_info_addr); + for (auto device : device_map) { auto neighbors = tt::Cluster::instance().get_ethernet_connected_device_ids(device.second->id()); std::vector device_router_cores; @@ -322,17 +338,25 @@ int main(int argc, char** argv) { log_info(LogTest, "Device {} router_mask = 0x{:04X}", device.first, router_mask); uint32_t sem_count = device_router_cores.size(); device_router_map[device.first] = device_router_phys_cores; - std::vector runtime_args = { - sem_count, // 0: number of active fabric routers - router_mask, // 1: active fabric router mask + + gk_phys_core = (device.second->worker_core_from_logical_core(gk_core)); + uint32_t gk_noc_offset = tt_metal::hal.noc_xy_encoding(gk_phys_core.x, gk_phys_core.y); + + std::vector router_compile_args = { + 
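+            // router compile args are hoisted out of the per-core loop; all routers on a device share them.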
(tunneler_queue_size_bytes >> 4),  // 0: rx_queue_size_words
+            tunneler_test_results_addr,        // 1: test_results_addr
+            tunneler_test_results_size,        // 2: test_results_size
+            0,                                 // 3: timeout_cycles
+        };
+
+        std::vector<uint32_t> router_runtime_args = {
+            sem_count,          // 0: number of active fabric routers
+            router_mask,        // 1: active fabric router mask
+            gk_interface_addr,  // 2: gk_message_addr_l
+            gk_noc_offset,      // 3: gk_message_addr_h
+        };
+
         for (auto logical_core : device_router_cores) {
-            std::vector<uint32_t> router_compile_args = {
-                (tunneler_queue_size_bytes >> 4),  // 0: rx_queue_size_words
-                tunneler_test_results_addr,        // 1: test_results_addr
-                tunneler_test_results_size,        // 2: test_results_size
-                0,                                 // 3: timeout_cycles
-            };
             auto router_kernel = tt_metal::CreateKernel(
                 program_map[device.first],
                 "tt_fabric/impl/kernels/tt_fabric_router.cpp",
@@ -340,7 +364,7 @@ int main(int argc, char** argv) {
                 tt_metal::EthernetConfig{
                     .noc = tt_metal::NOC::NOC_0, .compile_args = router_compile_args, .defines = defines});

-            tt_metal::SetRuntimeArgs(program_map[device.first], router_kernel, logical_core, runtime_args);
+            tt_metal::SetRuntimeArgs(program_map[device.first], router_kernel, logical_core, router_runtime_args);

             log_debug(
                 LogTest,
@@ -348,6 +372,33 @@ int main(int argc, char** argv) {
                 device.first,
                 device.second->ethernet_core_from_logical_core(logical_core));
         }
+
+        // setup runtime args
+        log_info(LogTest, "run tt_fabric gatekeeper at x={},y={}", gk_core.x, gk_core.y);
+        std::vector<uint32_t> gk_compile_args = {
+            gk_interface_addr,   // 0:
+            socket_info_addr,    // 1:
+            routing_table_addr,  // 2:
+            test_results_addr,   // 3: test_results_addr
+            test_results_size,   // 4: test_results_size
+            0,                   // 5: timeout_cycles
+        };
+
+        std::vector<uint32_t> gk_runtime_args = {
+            sem_count,    // 0: number of active fabric routers
+            router_mask,  // 1: active fabric router mask
+        };
+
+        auto kernel = tt_metal::CreateKernel(
+            program_map[device.first],
+            "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp",
+            {gk_core},
+            tt_metal::DataMovementConfig{
+                .processor = tt_metal::DataMovementProcessor::RISCV_0,
+                .noc = tt_metal::NOC::RISCV_0_default,
+                .compile_args = gk_compile_args,
+                .defines = defines});
+
+        tt_metal::SetRuntimeArgs(program_map[device.first], kernel, gk_core, gk_runtime_args);
     }

     if (check_txrx_timeout) {
@@ -358,6 +409,7 @@ int main(int argc, char** argv) {
     for (uint32_t i = 0; i < num_src_endpoints; i++) {
         CoreCoord core = {tx_x + i, tx_y};
         tx_phys_core.push_back(device_map[test_device_id_l]->worker_core_from_logical_core(core));
+        CoreCoord tx_gk_phys_core = device_map[test_device_id_l]->worker_core_from_logical_core(gk_core);
         std::vector<uint32_t> compile_args = {
             (device_map[test_device_id_l]->id() << 8) + src_endpoint_start_id + i,  // 0: src_endpoint_id
             num_dest_endpoints,                                                     // 1: num_dest_endpoints
@@ -365,32 +417,36 @@ int main(int argc, char** argv) {
             tx_queue_start_addr,                // 3: queue_start_addr_words
             (tx_queue_size_bytes >> 4),         // 4: queue_size_words
             routing_table_start_addr,           // 5: routing table
-            0,                                  // 6: router_x
-            0,                                  // 7: router_y
-            test_results_addr,                  // 8: test_results_addr
-            test_results_size,                  // 9: test_results_size
-            prng_seed,                          // 10: prng_seed
-            data_kb_per_tx,                     // 11: total_data_kb
-            max_packet_size_words,              // 12: max_packet_size_words
-            timeout_mcycles * 1000 * 1000 * 4,  // 13: timeout_cycles
-            tx_skip_pkt_content_gen,            // 14: skip_pkt_content_gen
-            tx_pkt_dest_size_choice,            // 15: pkt_dest_size_choice
-            tx_data_sent_per_iter_low,          // 16: data_sent_per_iter_low
-            tx_data_sent_per_iter_high,         // 17: data_sent_per_iter_high
-            fabric_command,                     // 18:
fabric command - target_address, - atomic_increment, - tx_signal_address + test_results_addr, // 6: test_results_addr + test_results_size, // 7: test_results_size + prng_seed, // 8: prng_seed + data_kb_per_tx, // 9: total_data_kb + max_packet_size_words, // 10: max_packet_size_words + timeout_mcycles * 1000 * 1000 * 4, // 11: timeout_cycles + tx_skip_pkt_content_gen, // 12: skip_pkt_content_gen + tx_pkt_dest_size_choice, // 13: pkt_dest_size_choice + tx_data_sent_per_iter_low, // 14: data_sent_per_iter_low + tx_data_sent_per_iter_high, // 15: data_sent_per_iter_high + fabric_command, // 16: fabric command + target_address, // 17: + atomic_increment, // 18: + tx_signal_address, // 19: + client_interface_addr, }; // setup runtime args + uint32_t tx_gk_noc_offset = tt_metal::hal.noc_xy_encoding(tx_gk_phys_core.x, tx_gk_phys_core.y); std::vector runtime_args = { (device_map[test_device_id_l]->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id 0x410, // 1: dest_noc_offset - router_phys_core.x, - router_phys_core.y, - (dev_r_mesh_id << 16 | dev_r_chip_id)}; + router_phys_core.x, // 2: router_x + router_phys_core.y, // 3: router_y + (dev_r_mesh_id << 16 | dev_r_chip_id), // 4: mesh and chip id + 0xd0000, // 5: space in rx's L1 + gk_interface_addr, // 6: gk_message_addr_l + tx_gk_noc_offset, // 7: gk_message_addr_h + }; if (ASYNC_WR == fabric_command) { runtime_args.push_back(target_address); @@ -421,10 +477,12 @@ int main(int argc, char** argv) { // Initialize tt_fabric router sync semaphore to 0. Routers on each device // increment once handshake with ethernet peer has been completed. std::vector zero_buf(1, 0); + std::vector gk_zero_buf(12, 0); for (auto [device_id, router_phys_cores] : device_router_map) { for (auto phys_core : router_phys_cores) { tt::llrt::write_hex_vec_to_core(device_id, phys_core, zero_buf, FABRIC_ROUTER_SYNC_SEM); } + tt::llrt::write_hex_vec_to_core(device_id, gk_phys_core, gk_zero_buf, gk_interface_addr); } for (uint32_t i = 0; i < num_src_endpoints; i++) { @@ -456,11 +514,10 @@ int main(int argc, char** argv) { log_info(LogTest, "Tx Finished"); - // terminate fabric routers + // terminate gatekeeper. + // Gatekeeper will signal all routers on the device to terminate. for (auto [device_id, router_phys_cores] : device_router_map) { - for (auto phys_core : router_phys_cores) { - tt::llrt::write_hex_vec_to_core(device_id, phys_core, zero_buf, FABRIC_ROUTER_SYNC_SEM); - } + tt::llrt::write_hex_vec_to_core(device_id, gk_phys_core, zero_buf, gk_interface_addr); } // wait for all kernels to finish. 
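Both sanity tests derive the gatekeeper and client addresses from the unreserved L1 base with the same arithmetic, so the host-side setup and the kernels agree on where each structure lives. A minimal sketch of that layout, assuming only the struct sizes from tt_fabric_interface.h (the names mirror the test code; the factor of 4 is taken as-is from the tests):

    // gatekeeper state sits directly above the routing tables
    uint32_t routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED);
    uint32_t gk_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4;
    uint32_t socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t);
    // a worker's client interface reuses the same offset, with its pull request buffer above it
    uint32_t client_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4;
    uint32_t client_pull_req_buf_addr = client_interface_addr + sizeof(fabric_client_interface_t);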
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index d5a758454f5..e730c58c88b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -31,6 +31,16 @@ std::mt19937 global_rng; // decides if the tx puts the data directly on eth or if a noc hop is allowed as well bool allow_1st_noc_hop = false; +// Gatekeeper kernel coordinates +uint32_t gk_x, gk_y; + +// Check if gatekeeper runs on tensix worker or idle ethernet based on the board type +bool run_gk_on_idle_ethernet; + +uint32_t routing_table_addr; +uint32_t gk_interface_addr; +uint32_t socket_info_addr; + inline std::vector get_random_numbers_from_range(uint32_t start, uint32_t end, uint32_t count) { std::vector range(end - start + 1); @@ -101,6 +111,11 @@ typedef struct test_board { } else { physical_chip_ids = available_chip_ids; } + + // gatekeeper - run on idle ethernet for n300/T3K + if (("n300" == board_type_) || ("t3k" == board_type_)) { + run_gk_on_idle_ethernet = true; + } } void _init_galaxy_board(uint32_t num_chips) { @@ -331,10 +346,13 @@ typedef struct test_device { std::vector worker_cores; std::vector router_logical_cores; std::vector router_physical_cores; + CoreCoord gk_logical_core; + CoreCoord gk_phys_core; mesh_id_t mesh_id; chip_id_t logical_chip_id; uint32_t mesh_chip_id = 0; uint32_t router_mask = 0; + uint32_t gk_noc_offset; test_device(chip_id_t chip_id_, test_board_t* board_handle_) { physical_chip_id = chip_id_; @@ -367,6 +385,21 @@ typedef struct test_device { router_mask += 0x1 << logical_core.y; } } + + // gatekeeper + if (run_gk_on_idle_ethernet) { + auto idle_eth_cores = device_handle->get_inactive_ethernet_cores(); + if (idle_eth_cores.size() == 0) { + throw std::runtime_error("No idle ethernet cores found on the device"); + } + + gk_logical_core = *idle_eth_cores.begin(); + gk_phys_core = device_handle->ethernet_core_from_logical_core(gk_logical_core); + } else { + gk_logical_core = {gk_x, gk_y}; + gk_phys_core = device_handle->worker_core_from_logical_core(gk_logical_core); + } + gk_noc_offset = tt_metal::hal.noc_xy_encoding(gk_phys_core.x, gk_phys_core.y); } void create_router_kernels(std::vector& compile_args, std::map& defines) { @@ -376,8 +409,10 @@ typedef struct test_device { for (auto i = 0; i < num_routers; i++) { // setup run time args std::vector runtime_args = { - num_routers, // 0: number of active fabric routers - router_mask, // 1: active fabric router mask + num_routers, // 0: number of active fabric routers + router_mask, // 1: active fabric router mask + gk_interface_addr, // 2: gk_message_addr_l + gk_noc_offset, // 3: gk_message_addr_h }; // initialize the semaphore @@ -395,6 +430,45 @@ typedef struct test_device { } } + void create_gatekeeper_kernel(std::vector& compile_args, std::map& defines) { + uint32_t num_routers = router_logical_cores.size(); + std::vector zero_buf(12, 0); + + std::vector runtime_args = { + num_routers, // 0: number of active fabric routers + router_mask, // 1: active fabric router mask + }; + + // initialize the semaphore + tt::llrt::write_hex_vec_to_core(device_handle->id(), gk_phys_core, zero_buf, gk_interface_addr); + + KernelHandle kernel; + + if (run_gk_on_idle_ethernet) { + kernel = tt_metal::CreateKernel( + program_handle, + "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp", + {gk_logical_core}, + 
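+                // on n300/T3K boards the gatekeeper runs on an idle ethernet core
+                // (run_gk_on_idle_ethernet), hence the EthernetConfig with Eth::IDLE.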
tt_metal::EthernetConfig{ + .eth_mode = Eth::IDLE, + .noc = tt_metal::NOC::NOC_0, + .compile_args = compile_args, + .defines = defines}); + } else { + kernel = tt_metal::CreateKernel( + program_handle, + "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp", + {gk_logical_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .compile_args = compile_args, + .defines = defines}); + } + + tt_metal::SetRuntimeArgs(program_handle, kernel, gk_logical_core, runtime_args); + } + void terminate_router_kernels() { std::vector zero_buf(1, 0); for (auto& core : router_physical_cores) { @@ -402,6 +476,11 @@ typedef struct test_device { } } + void terminate_gatekeeper_kernel() { + std::vector zero_buf(12, 0); + tt::llrt::write_hex_vec_to_core(device_handle->id(), gk_phys_core, zero_buf, gk_interface_addr); + } + std::vector select_random_worker_cores(uint32_t count) { std::vector result; @@ -530,7 +609,9 @@ typedef struct test_traffic { router_phys_core.x, // 2: router_x router_phys_core.y, // 3: router_y mesh_chip_id, // 4: mesh and chip id - rx_buf_size // 5: space in rx's L1 + rx_buf_size, // 5: space in rx's L1 + gk_interface_addr, // 6: gk_message_addr_l + tx_device->gk_noc_offset, // 7: gk_message_addr_h }; if (ASYNC_WR == fabric_command) { @@ -831,6 +912,8 @@ int main(int argc, char **argv) { constexpr uint32_t default_tx_y = 0; constexpr uint32_t default_rx_x = 0; constexpr uint32_t default_rx_y = 3; + constexpr uint32_t default_gk_x = 0; + constexpr uint32_t default_gk_y = 9; constexpr uint32_t default_mux_x = 0; constexpr uint32_t default_mux_y = 1; @@ -939,6 +1022,8 @@ int main(int argc, char **argv) { uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y); uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x); uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y); + gk_x = test_args::get_command_option_uint32(input_args, "--gk_x", default_gk_x); + gk_y = test_args::get_command_option_uint32(input_args, "--gk_y", default_gk_y); uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed); uint32_t data_kb_per_tx = test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx); uint32_t max_packet_size_words = test_args::get_command_option_uint32(input_args, "--max_packet_size_words", default_max_packet_size_words); @@ -1107,7 +1192,15 @@ int main(int argc, char **argv) { throw std::runtime_error("Test cannot run on specified device."); } */ - // launch router kernels + if (run_gk_on_idle_ethernet) { + routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); + } else { + routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + } + gk_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; + socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t); + + // create router kernels std::vector router_compile_args = { (tunneler_queue_size_bytes >> 4), // 0: rx_queue_size_words tunneler_test_results_addr, // 1: test_results_addr @@ -1118,10 +1211,26 @@ int main(int argc, char **argv) { test_device->create_router_kernels(router_compile_args, defines); } + // create gatekeeper kernel + std::vector gatekeeper_compile_args = { + gk_interface_addr, // 0: gk info addr + socket_info_addr, // 1: + routing_table_addr, // 2: + 
test_results_addr,   // 3: test_results_addr
+        test_results_size,   // 4: test_results_size
+        0,                   // 5: timeout_cycles
+    };
+
+    for (auto& [chip_id, test_device] : test_devices) {
+        test_device->create_gatekeeper_kernel(gatekeeper_compile_args, defines);
+    }
+
     if (check_txrx_timeout) {
         defines["CHECK_TIMEOUT"] = "";
     }

+    uint32_t client_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4;
+    uint32_t client_pull_req_buf_addr = client_interface_addr + sizeof(fabric_client_interface_t);
+
     std::vector<uint32_t> tx_compile_args = {
         0,  //(device->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id
         num_dest_endpoints,                 // 1: num_dest_endpoints
         dest_endpoint_start_id,             // 2: dest_endpoint_start_id
         tx_queue_start_addr,                // 3: queue_start_addr_words
         (tx_queue_size_bytes >> 4),         // 4: queue_size_words
         routing_table_start_addr,           // 5: routing table
-        0,  // tunneler_phys_core.x,        // 6: router_x
-        0,  // tunneler_phys_core.y,        // 7: router_y
-        test_results_addr,                  // 8: test_results_addr
-        test_results_size,                  // 9: test_results_size
-        prng_seed,                          // 10: prng_seed
-        data_kb_per_tx,                     // 11: total_data_kb
-        max_packet_size_words,              // 12: max_packet_size_words
-        timeout_mcycles * 1000 * 1000 * 4,  // 13: timeout_cycles
-        tx_skip_pkt_content_gen,            // 14: skip_pkt_content_gen
-        tx_pkt_dest_size_choice,            // 15: pkt_dest_size_choice
-        tx_data_sent_per_iter_low,          // 16: data_sent_per_iter_low
-        tx_data_sent_per_iter_high,         // 17: data_sent_per_iter_high
-        fabric_command,                     // 18: fabric_command
-        target_address,                     // 19: target_address
-        atomic_increment,                   // 20: atomic_increment
-        tx_signal_address                   // 21: tx_signal_address
+        test_results_addr,                  // 6: test_results_addr
+        test_results_size,                  // 7: test_results_size
+        prng_seed,                          // 8: prng_seed
+        data_kb_per_tx,                     // 9: total_data_kb
+        max_packet_size_words,              // 10: max_packet_size_words
+        timeout_mcycles * 1000 * 1000 * 4,  // 11: timeout_cycles
+        tx_skip_pkt_content_gen,            // 12: skip_pkt_content_gen
+        tx_pkt_dest_size_choice,            // 13: pkt_dest_size_choice
+        tx_data_sent_per_iter_low,          // 14: data_sent_per_iter_low
+        tx_data_sent_per_iter_high,         // 15: data_sent_per_iter_high
+        fabric_command,                     // 16: fabric_command
+        target_address,                     // 17: target_address
+        atomic_increment,                   // 18: atomic_increment
+        tx_signal_address,                  // 19: tx_signal_address
+        client_interface_addr,              // 20:
+        client_pull_req_buf_addr,           // 21:
     };

     std::vector<uint32_t> rx_compile_args = {
@@ -1189,7 +1298,7 @@ int main(int argc, char **argv) {

     // terminate fabric routers
     for (auto& [chip_id, test_device] : test_devices) {
-        test_device->terminate_router_kernels();
+        test_device->terminate_gatekeeper_kernel();
     }

     // wait for programs to exit
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp
new file mode 100644
index 00000000000..e166f43706d
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp
@@ -0,0 +1,659 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include "tt_fabric/control_plane.hpp" +// #include "tt_metal/impl/dispatch/cq_commands.hpp" +// #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" +#include "kernels/tt_fabric_traffic_gen_test.hpp" +#include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp" +#include "eth_l1_address_map.h" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" + +using std::vector; +using namespace tt; +using namespace tt::tt_fabric; +using json = nlohmann::json; + +int main(int argc, char** argv) { + constexpr uint32_t default_tx_x = 0; + constexpr uint32_t default_tx_y = 0; + constexpr uint32_t default_rx_x = 0; + constexpr uint32_t default_rx_y = 3; + constexpr uint32_t default_gk_x = 0; + constexpr uint32_t default_gk_y = 9; + + constexpr uint32_t default_mux_x = 0; + constexpr uint32_t default_mux_y = 1; + constexpr uint32_t default_demux_x = 0; + constexpr uint32_t default_demux_y = 2; + + constexpr uint32_t default_prng_seed = 0x100; + constexpr uint32_t default_data_kb_per_tx = 1024 * 1024; + constexpr uint32_t default_max_packet_size_words = 0x100; + + constexpr uint32_t default_routing_table_start_addr = 0x7EC00; + constexpr uint32_t default_tx_queue_start_addr = 0x80000; + constexpr uint32_t default_tx_queue_size_bytes = 0x10000; + constexpr uint32_t default_rx_queue_start_addr = 0x80000; + constexpr uint32_t default_rx_queue_size_bytes = 0x10000; + constexpr uint32_t default_tx_signal_address = 0x70000; + + constexpr uint32_t default_test_results_addr = 0x100000; + constexpr uint32_t default_test_results_size = 0x40000; + + // TODO: use eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE which should be 0x19900, set test results size back + // to 0x7000 + constexpr uint32_t default_tunneler_queue_size_bytes = + 0x4000; // max fvc size (send + receive. 
top half is send buffer, bottom half is receive buffer) + constexpr uint32_t default_tunneler_test_results_addr = + 0x39000; // 0x8000 * 4 + 0x19000; 0x10000 * 4 + 0x19000 = 0x59000 > 0x40000 (256kB) + constexpr uint32_t default_tunneler_test_results_size = 0x6000; // 256kB total L1 in ethernet core - 0x39000 + + constexpr uint32_t default_timeout_mcycles = 1000; + constexpr uint32_t default_rx_disable_data_check = 0; + constexpr uint32_t default_rx_disable_header_check = 0; + constexpr uint32_t default_tx_skip_pkt_content_gen = 0; + constexpr uint32_t default_check_txrx_timeout = 1; + + constexpr uint32_t src_endpoint_start_id = 4; + constexpr uint32_t dest_endpoint_start_id = 11; + + constexpr uint32_t num_endpoints = 1; + constexpr uint32_t num_src_endpoints = num_endpoints; + constexpr uint32_t num_dest_endpoints = num_endpoints; + + constexpr uint8_t default_tx_pkt_dest_size_choice = 0; // pkt_dest_size_choices_t + + constexpr uint32_t default_tx_data_sent_per_iter_low = 20; + constexpr uint32_t default_tx_data_sent_per_iter_high = 240; + + constexpr uint32_t default_fabric_command = 1; + + constexpr uint32_t default_dump_stat_json = 0; + constexpr const char* default_output_dir = "/tmp"; + + constexpr uint32_t default_test_device_id_l = 0; + constexpr uint32_t default_test_device_id_r = -1; + + constexpr uint32_t default_target_address = 0x30000; + + constexpr uint32_t default_atomic_increment = 4; + + std::vector input_args(argv, argv + argc); + if (test_args::has_command_option(input_args, "-h") || test_args::has_command_option(input_args, "--help")) { + log_info(LogTest, "Usage:"); + log_info(LogTest, " --prng_seed: PRNG seed, default = 0x{:x}", default_prng_seed); + log_info(LogTest, " --data_kb_per_tx: Total data in KB per TX endpoint, default = {}", default_data_kb_per_tx); + log_info( + LogTest, + " --max_packet_size_words: Max packet size in words, default = 0x{:x}", + default_max_packet_size_words); + log_info(LogTest, " --tx_x: X coordinate of the starting TX core, default = {}", default_tx_x); + log_info(LogTest, " --tx_y: Y coordinate of the starting TX core, default = {}", default_tx_y); + log_info(LogTest, " --rx_x: X coordinate of the starting RX core, default = {}", default_rx_x); + log_info(LogTest, " --rx_y: Y coordinate of the starting RX core, default = {}", default_rx_y); + log_info( + LogTest, + " --routing_table_start_addr: Routing Table start address, default = 0x{:x}", + default_routing_table_start_addr); + log_info( + LogTest, " --tx_queue_start_addr: TX queue start address, default = 0x{:x}", default_tx_queue_start_addr); + log_info( + LogTest, " --tx_queue_size_bytes: TX queue size in bytes, default = 0x{:x}", default_tx_queue_size_bytes); + log_info( + LogTest, " --rx_queue_start_addr: RX queue start address, default = 0x{:x}", default_rx_queue_start_addr); + log_info( + LogTest, " --rx_queue_size_bytes: RX queue size in bytes, default = 0x{:x}", default_rx_queue_size_bytes); + log_info( + LogTest, " --test_results_addr: test results buf address, default = 0x{:x}", default_test_results_addr); + log_info(LogTest, " --test_results_size: test results buf size, default = 0x{:x}", default_test_results_size); + log_info(LogTest, " --timeout_mcycles: Timeout in MCycles, default = {}", default_timeout_mcycles); + log_info( + LogTest, + " --check_txrx_timeout: Check if timeout happens during tx & rx (if enabled, timeout_mcycles will also be " + "used), default = {}", + default_check_txrx_timeout); + log_info( + LogTest, + " --rx_disable_data_check: Disable data 
check on RX, default = {}", + default_rx_disable_data_check); + log_info( + LogTest, + " --rx_disable_header_check: Disable header check on RX, default = {}", + default_rx_disable_header_check); + log_info( + LogTest, + " --tx_skip_pkt_content_gen: Skip packet content generation during tx, default = {}", + default_tx_skip_pkt_content_gen); + log_info( + LogTest, + " --tx_pkt_dest_size_choice: choice for how packet destination and packet size are generated, default = " + "{}", + default_tx_pkt_dest_size_choice); // pkt_dest_size_choices_t + log_info( + LogTest, + " --tx_data_sent_per_iter_low: the criteria to determine the amount of tx data sent per iter is low " + "(unit: words); if both 0, then disable counting it in tx kernel, default = {}", + default_tx_data_sent_per_iter_low); + log_info( + LogTest, + " --tx_data_sent_per_iter_high: the criteria to determine the amount of tx data sent per iter is high " + "(unit: words); if both 0, then disable counting it in tx kernel, default = {}", + default_tx_data_sent_per_iter_high); + log_info(LogTest, " --dump_stat_json: Dump stats in json to output_dir, default = {}", default_dump_stat_json); + log_info(LogTest, " --output_dir: Output directory, default = {}", default_output_dir); + log_info( + LogTest, " --device_id: Device on which the test will be run, default = {}", default_test_device_id_l); + log_info( + LogTest, " --device_id_r: Device on which the test will be run, default = {}", default_test_device_id_r); + return 0; + } + + uint32_t tx_x = test_args::get_command_option_uint32(input_args, "--tx_x", default_tx_x); + uint32_t tx_y = test_args::get_command_option_uint32(input_args, "--tx_y", default_tx_y); + uint32_t rx_x = test_args::get_command_option_uint32(input_args, "--rx_x", default_rx_x); + uint32_t rx_y = test_args::get_command_option_uint32(input_args, "--rx_y", default_rx_y); + uint32_t gk_x = test_args::get_command_option_uint32(input_args, "--gk_x", default_gk_x); + uint32_t gk_y = test_args::get_command_option_uint32(input_args, "--gk_y", default_gk_y); + uint32_t prng_seed = test_args::get_command_option_uint32(input_args, "--prng_seed", default_prng_seed); + uint32_t data_kb_per_tx = + test_args::get_command_option_uint32(input_args, "--data_kb_per_tx", default_data_kb_per_tx); + uint32_t max_packet_size_words = + test_args::get_command_option_uint32(input_args, "--max_packet_size_words", default_max_packet_size_words); + uint32_t routing_table_start_addr = test_args::get_command_option_uint32( + input_args, "--routing_table_start_addr", default_routing_table_start_addr); + uint32_t tx_queue_start_addr = + test_args::get_command_option_uint32(input_args, "--tx_queue_start_addr", default_tx_queue_start_addr); + uint32_t tx_queue_size_bytes = + test_args::get_command_option_uint32(input_args, "--tx_queue_size_bytes", default_tx_queue_size_bytes); + uint32_t rx_queue_start_addr = + test_args::get_command_option_uint32(input_args, "--rx_queue_start_addr", default_rx_queue_start_addr); + uint32_t rx_queue_size_bytes = + test_args::get_command_option_uint32(input_args, "--rx_queue_size_bytes", default_rx_queue_size_bytes); + uint32_t tunneler_queue_size_bytes = test_args::get_command_option_uint32( + input_args, "--tunneler_queue_size_bytes", default_tunneler_queue_size_bytes); + uint32_t test_results_addr = + test_args::get_command_option_uint32(input_args, "--test_results_addr", default_test_results_addr); + uint32_t test_results_size = + test_args::get_command_option_uint32(input_args, "--test_results_size", 
default_test_results_size); + uint32_t tunneler_test_results_addr = test_args::get_command_option_uint32( + input_args, "--tunneler_test_results_addr", default_tunneler_test_results_addr); + uint32_t tunneler_test_results_size = test_args::get_command_option_uint32( + input_args, "--tunneler_test_results_size", default_tunneler_test_results_size); + uint32_t timeout_mcycles = + test_args::get_command_option_uint32(input_args, "--timeout_mcycles", default_timeout_mcycles); + uint32_t rx_disable_data_check = + test_args::get_command_option_uint32(input_args, "--rx_disable_data_check", default_rx_disable_data_check); + uint32_t rx_disable_header_check = + test_args::get_command_option_uint32(input_args, "--rx_disable_header_check", default_rx_disable_header_check); + uint32_t tx_skip_pkt_content_gen = + test_args::get_command_option_uint32(input_args, "--tx_skip_pkt_content_gen", default_tx_skip_pkt_content_gen); + uint32_t dump_stat_json = + test_args::get_command_option_uint32(input_args, "--dump_stat_json", default_dump_stat_json); + std::string output_dir = test_args::get_command_option(input_args, "--output_dir", std::string(default_output_dir)); + uint32_t check_txrx_timeout = + test_args::get_command_option_uint32(input_args, "--check_txrx_timeout", default_check_txrx_timeout); + uint8_t tx_pkt_dest_size_choice = (uint8_t)test_args::get_command_option_uint32( + input_args, "--tx_pkt_dest_size_choice", default_tx_pkt_dest_size_choice); + uint32_t tx_data_sent_per_iter_low = test_args::get_command_option_uint32( + input_args, "--tx_data_sent_per_iter_low", default_tx_data_sent_per_iter_low); + uint32_t tx_data_sent_per_iter_high = test_args::get_command_option_uint32( + input_args, "--tx_data_sent_per_iter_high", default_tx_data_sent_per_iter_high); + uint32_t fabric_command = + test_args::get_command_option_uint32(input_args, "--fabric_command", default_fabric_command); + uint32_t target_address = + test_args::get_command_option_uint32(input_args, "--target_address", default_target_address); + uint32_t atomic_increment = + test_args::get_command_option_uint32(input_args, "--atomic_increment", default_atomic_increment); + assert( + (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::SAME_START_RNDROBIN_FIX_SIZE && + rx_disable_header_check || + (pkt_dest_size_choices_t)tx_pkt_dest_size_choice == pkt_dest_size_choices_t::RANDOM); + + uint32_t test_device_id_l = + test_args::get_command_option_uint32(input_args, "--device_id", default_test_device_id_l); + uint32_t test_device_id_r = + test_args::get_command_option_uint32(input_args, "--device_id_r", default_test_device_id_r); + + uint32_t tx_signal_address = default_tx_signal_address; + + bool pass = true; + + CoreCoord gk_core = {gk_x, gk_y}; + + std::map defines = { + {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth + }; + + try { + const std::filesystem::path tg_mesh_graph_desc_path = + std::filesystem::path(tt::llrt::RunTimeOptions::get_instance().get_root_dir()) / + "tt_fabric/mesh_graph_descriptors/tg_mesh_graph_descriptor.yaml"; + auto control_plane = std::make_unique(tg_mesh_graph_desc_path.string()); + + int num_devices = tt_metal::GetNumAvailableDevices(); + if (test_device_id_l >= num_devices) { + log_info( + LogTest, "Device {} is not valid. 
Highest valid device id = {}.", test_device_id_l, num_devices - 1); + throw std::runtime_error("Invalid Device Id."); + } + + std::map<chip_id_t, tt_metal::Device*> device_map; + + std::vector<chip_id_t> chip_ids; + for (unsigned int id = 4; id < 36; id++) { + chip_ids.push_back(id); + } + device_map = tt::tt_metal::detail::CreateDevices(chip_ids); + + log_info(LogTest, "Created {} Devices ...", device_map.size()); + + std::map<chip_id_t, tt_metal::Program> program_map; + + for (uint32_t i = 4; i < 36; i++) { + program_map[i] = tt_metal::CreateProgram(); + } + + log_info(LogTest, "Created Programs ..."); + + std::map<chip_id_t, std::vector<CoreCoord>> device_router_map; + + const auto& device_active_eth_cores = device_map[test_device_id_l]->get_active_ethernet_cores(); + + if (device_active_eth_cores.size() == 0) { + log_info( + LogTest, + "Device {} does not have enough active cores. Need 1 active ethernet core for this test.", + test_device_id_l); + for (auto dev : device_map) { + tt_metal::CloseDevice(dev.second); + } + + throw std::runtime_error("Test cannot run on specified device."); + } + + auto [dev_l_mesh_id, dev_l_chip_id] = control_plane->get_mesh_chip_id_from_physical_chip_id(test_device_id_l); + auto [dev_r_mesh_id, dev_r_chip_id] = control_plane->get_mesh_chip_id_from_physical_chip_id(test_device_id_r); + + log_info( + LogTest, + "Running on Left Device {} : Fabric Mesh Id {} : Fabric Device Id {}", + test_device_id_l, + dev_l_mesh_id, + dev_l_chip_id); + log_info( + LogTest, + "Running on Right Device {} : Fabric Mesh Id {} : Fabric Device Id {}", + test_device_id_r, + dev_r_mesh_id, + dev_r_chip_id); + + bool router_core_found = false; + CoreCoord router_logical_core; + CoreCoord router_phys_core; + CoreCoord gk_phys_core; + uint32_t routing_table_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); + uint32_t gk_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; + uint32_t client_interface_addr = routing_table_addr + sizeof(fabric_router_l1_config_t) * 4; + uint32_t client_pull_req_buf_addr = client_interface_addr + sizeof(fabric_client_interface_t); + uint32_t socket_info_addr = gk_interface_addr + sizeof(gatekeeper_info_t); + log_info(LogTest, "GK Routing Table Addr = 0x{:08X}", routing_table_addr); + log_info(LogTest, "GK Info Addr = 0x{:08X}", gk_interface_addr); + log_info(LogTest, "GK Socket Info Addr = 0x{:08X}", socket_info_addr); + + for (auto device : device_map) { + auto neighbors = tt::Cluster::instance().get_ethernet_connected_device_ids(device.first); + std::vector<CoreCoord> device_router_cores; + std::vector<CoreCoord> device_router_phys_cores; + uint32_t router_mask = 0; + for (auto neighbor : neighbors) { + if (device_map.contains(neighbor)) { + if (!router_core_found && device.first == test_device_id_l) { + // pick a router core on the sender device; the tx kernel will + // read routing tables from this core. 
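+ // Note: any ethernet socket to this neighbor would presumably work equally + // well here; index 0 is simply a deterministic choice.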
+ router_logical_core = device.second->get_ethernet_sockets(neighbor)[0]; + router_phys_core = device.second->ethernet_core_from_logical_core(router_logical_core); + router_core_found = true; + } + auto connected_logical_cores = device.second->get_ethernet_sockets(neighbor); + for (auto logical_core : connected_logical_cores) { + device_router_cores.push_back(logical_core); + device_router_phys_cores.push_back( + device.second->ethernet_core_from_logical_core(logical_core)); + router_mask += 0x1 << logical_core.y; + } + } else { + log_debug( + LogTest, + "Device {} skipping Neighbor Device {} since it is not in test device map.", + device.first, + neighbor); + } + } + + log_info(LogTest, "Device {} router_mask = 0x{:04X}", device.first, router_mask); + uint32_t sem_count = device_router_cores.size(); + device_router_map[device.first] = device_router_phys_cores; + + gk_phys_core = (device.second->worker_core_from_logical_core(gk_core)); + uint32_t gk_noc_offset = tt_metal::hal.noc_xy_encoding(gk_phys_core.x, gk_phys_core.y); + + std::vector<uint32_t> router_compile_args = { + (tunneler_queue_size_bytes >> 4), // 0: rx_queue_size_words + tunneler_test_results_addr, // 1: test_results_addr + tunneler_test_results_size, // 2: test_results_size + 0, // 3: timeout_cycles + }; + + std::vector<uint32_t> router_runtime_args = { + sem_count, // 0: number of active fabric routers + router_mask, // 1: active fabric router mask + gk_interface_addr, // 2: gk_message_addr_l + gk_noc_offset, // 3: gk_message_addr_h + }; + + for (auto logical_core : device_router_cores) { + auto router_kernel = tt_metal::CreateKernel( + program_map[device.first], + "tt_fabric/impl/kernels/tt_fabric_router.cpp", + logical_core, + tt_metal::EthernetConfig{ + .noc = tt_metal::NOC::NOC_0, .compile_args = router_compile_args, .defines = defines}); + + tt_metal::SetRuntimeArgs(program_map[device.first], router_kernel, logical_core, router_runtime_args); + + log_debug( + LogTest, + "Device {} router added on physical core {}", + device.first, + device.second->ethernet_core_from_logical_core(logical_core)); + } + // set up gatekeeper kernel + log_info(LogTest, "run tt_fabric gatekeeper at x={},y={}", gk_core.x, gk_core.y); + std::vector<uint32_t> gk_compile_args = { + gk_interface_addr, // 0: gk info addr + socket_info_addr, // 1: socket info addr + routing_table_addr, // 2: routing table addr + test_results_addr, // 3: test_results_addr + test_results_size, // 4: test_results_size + 0, // 5: timeout_cycles + }; + + std::vector<uint32_t> gk_runtime_args = { + sem_count, // 0: number of active fabric routers + router_mask, // 1: active fabric router mask + }; + + auto kernel = tt_metal::CreateKernel( + program_map[device.first], + "tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp", + {gk_core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .compile_args = gk_compile_args, + .defines = defines}); + + tt_metal::SetRuntimeArgs(program_map[device.first], kernel, gk_core, gk_runtime_args); + } + + if (check_txrx_timeout) { + defines["CHECK_TIMEOUT"] = ""; + } + + std::vector<CoreCoord> tx_phys_core; + for (uint32_t i = 0; i < num_src_endpoints; i++) { + CoreCoord core = {tx_x + i, tx_y}; + tx_phys_core.push_back(device_map[test_device_id_l]->worker_core_from_logical_core(core)); + CoreCoord tx_gk_phys_core = device_map[test_device_id_l]->worker_core_from_logical_core(gk_core); + std::vector<uint32_t> compile_args = { + (device_map[test_device_id_l]->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + 
dest_endpoint_start_id, // 2: dest_endpoint_start_id + tx_queue_start_addr, // 3: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 4: queue_size_words + routing_table_start_addr, // 5: routing table + gk_interface_addr, // 6: gk_message_addr_l + (tx_gk_phys_core.y << 10) | (tx_gk_phys_core.x << 4), // 7: gk_message_addr_h + test_results_addr, // 8: test_results_addr + test_results_size, // 9: test_results_size + prng_seed, // 10: prng_seed + data_kb_per_tx, // 11: total_data_kb + max_packet_size_words, // 12: max_packet_size_words + timeout_mcycles * 1000 * 1000 * 4, // 13: timeout_cycles + tx_skip_pkt_content_gen, // 14: skip_pkt_content_gen + tx_pkt_dest_size_choice, // 15: pkt_dest_size_choice + tx_data_sent_per_iter_low, // 16: data_sent_per_iter_low + tx_data_sent_per_iter_high, // 17: data_sent_per_iter_high + fabric_command, // 18: fabric command + target_address, // 19: target_address + atomic_increment, // 20: atomic_increment + tx_signal_address, // 21: tx_signal_address + client_interface_addr, // 22: client_interface_addr + client_pull_req_buf_addr, // 23: client_pull_req_buf_addr + }; + + // setup runtime args + std::vector<uint32_t> runtime_args = { + (device_map[test_device_id_l]->id() << 8) + src_endpoint_start_id + i, // 0: src_endpoint_id + 0x410, // 1: dest_noc_offset + router_phys_core.x, // 2: router_x + router_phys_core.y, // 3: router_y + (dev_r_mesh_id << 16 | dev_r_chip_id)}; // 4: dest mesh and chip id + + if (ASYNC_WR == fabric_command) { + runtime_args.push_back(target_address); + } + + log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); + auto kernel = tt_metal::CreateKernel( + program_map[test_device_id_l], + "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_tx_socket.cpp", + {core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .compile_args = compile_args, + .defines = defines}); + + tt_metal::SetRuntimeArgs(program_map[test_device_id_l], kernel, core, runtime_args); + } + + std::vector<CoreCoord> rx_phys_core; + for (uint32_t i = 0; i < num_dest_endpoints; i++) { + CoreCoord core = {rx_x + i, rx_y}; + rx_phys_core.push_back(device_map[test_device_id_r]->worker_core_from_logical_core(core)); + CoreCoord rx_gk_phys_core = device_map[test_device_id_r]->worker_core_from_logical_core(gk_core); + + std::vector<uint32_t> compile_args = { + prng_seed, // 0: prng seed + data_kb_per_tx, // 1: total data kb + max_packet_size_words, // 2: max packet size (in words) + fabric_command, // 3: fabric command + target_address, // 4: target address + atomic_increment, // 5: atomic increment + test_results_addr, // 6: test results addr + test_results_size, // 7: test results size in bytes + gk_interface_addr, // 8: gk_message_addr_l + (rx_gk_phys_core.y << 10) | (rx_gk_phys_core.x << 4), // 9: gk_message_addr_h + client_interface_addr, // 10: client_interface_addr + client_pull_req_buf_addr, // 11: client_pull_req_buf_addr + rx_queue_start_addr, // 12: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 13: queue_size_words + }; + + std::vector<uint32_t> runtime_args = {(dev_l_mesh_id << 16 | dev_l_chip_id)}; + + log_info(LogTest, "run socket rx at x={},y={}", core.x, core.y); + auto kernel = tt_metal::CreateKernel( + program_map[test_device_id_r], + "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/tt_fabric_traffic_gen_rx_socket.cpp", + {core}, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .compile_args = compile_args, + .defines = defines}); + + tt_metal::SetRuntimeArgs(program_map[test_device_id_r], kernel, core, runtime_args); + } + + if (check_txrx_timeout) { + defines.erase("CHECK_TIMEOUT"); + } + + log_info(LogTest, 
"Starting test..."); + + auto start = std::chrono::system_clock::now(); + + // Initialize tt_fabric router sync semaphore to 0. Routers on each device + // increment once handshake with ethernet peer has been completed. + std::vector zero_buf(1, 0); + std::vector gk_zero_buf(12, 0); + for (auto [device_id, router_phys_cores] : device_router_map) { + for (auto phys_core : router_phys_cores) { + tt::llrt::write_hex_vec_to_core(device_id, phys_core, zero_buf, FABRIC_ROUTER_SYNC_SEM); + } + tt::llrt::write_hex_vec_to_core(device_id, gk_phys_core, gk_zero_buf, gk_interface_addr); + } + + for (uint32_t i = 0; i < num_src_endpoints; i++) { + tt::llrt::write_hex_vec_to_core(test_device_id_l, tx_phys_core[i], zero_buf, tx_signal_address); + } + + // clear test status to 0. it will be set to non-zero value by tx kernel. + tt::llrt::write_hex_vec_to_core(test_device_id_l, tx_phys_core[0], zero_buf, test_results_addr); + + for (auto device : device_map) { + log_info(LogTest, "Launching on {}", device.first); + tt_metal::detail::LaunchProgram(device.second, program_map[device.first], false); + } + + // Once all the kernels have been launched on all devices, set the tx_start signal + // to trigger tx kernels to start sending data. + std::vector tx_start(1, 1); + for (uint32_t i = 0; i < num_src_endpoints; i++) { + tt::llrt::write_hex_vec_to_core(test_device_id_l, tx_phys_core[i], tx_start, tx_signal_address); + } + + // Wait for tx to return non-zero status. + while (1) { + auto tx_status = tt::llrt::read_hex_vec_from_core(test_device_id_l, tx_phys_core[0], test_results_addr, 4); + if ((tx_status[0] & 0xFFFF) != 0) { + break; + } + } + + log_info(LogTest, "Tx Finished"); + + // terminate gatekeeper. + // Gatekeeper will signal all routers on the device to terminate. + for (auto [device_id, router_phys_cores] : device_router_map) { + tt::llrt::write_hex_vec_to_core(device_id, gk_phys_core, zero_buf, gk_interface_addr); + } + + // wait for all kernels to finish. + for (auto device : device_map) { + tt_metal::detail::WaitProgramDone(device.second, program_map[device.first]); + } + + auto end = std::chrono::system_clock::now(); + + std::chrono::duration elapsed_seconds = (end - start); + log_info(LogTest, "Ran in {:.2f}us", elapsed_seconds.count() * 1000 * 1000); + + vector> tx_results; + + for (uint32_t i = 0; i < num_src_endpoints; i++) { + tx_results.push_back(tt::llrt::read_hex_vec_from_core( + device_map[test_device_id_l]->id(), tx_phys_core[i], test_results_addr, 128)); + log_info( + LogTest, + "TX{} status = {}", + i, + packet_queue_test_status_to_string(tx_results[i][PQ_TEST_STATUS_INDEX])); + pass &= (tx_results[i][PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + } + /* + TODO: Need to add these once control plane api is available to + get the first and last hop fabric router core. 
+ vector router_results = + tt::llrt::read_hex_vec_from_core( + device_map[test_device_id_l]->id(), tunneler_phys_core, tunneler_test_results_addr, 128); + log_info(LogTest, "L Router status = {}", + packet_queue_test_status_to_string(router_results[PQ_TEST_STATUS_INDEX])); pass &= + (router_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + + vector r_router_results = + tt::llrt::read_hex_vec_from_core( + device_map[test_device_id_r]->id(), r_tunneler_phys_core, tunneler_test_results_addr, 128); + log_info(LogTest, "R Router status = {}", + packet_queue_test_status_to_string(r_router_results[PQ_TEST_STATUS_INDEX])); pass &= + (r_router_results[PQ_TEST_STATUS_INDEX] == PACKET_QUEUE_TEST_PASS); + */ + for (auto active_device : device_map) { + pass &= tt_metal::CloseDevice(active_device.second); + } + + if (pass) { + double total_tx_bw = 0.0; + uint64_t total_tx_words_sent = 0; + uint64_t total_rx_words_checked = 0; + for (uint32_t i = 0; i < num_src_endpoints; i++) { + uint64_t tx_words_sent = get_64b_result(tx_results[i], PQ_TEST_WORD_CNT_INDEX); + total_tx_words_sent += tx_words_sent; + uint64_t tx_elapsed_cycles = get_64b_result(tx_results[i], PQ_TEST_CYCLES_INDEX); + double tx_bw = ((double)tx_words_sent) * PACKET_WORD_SIZE_BYTES / tx_elapsed_cycles; + total_tx_bw += tx_bw; + uint64_t iter = get_64b_result(tx_results[i], PQ_TEST_ITER_INDEX); + // uint64_t zero_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_ZERO_DATA_WORDS_SENT_ITER); + // uint64_t few_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_FEW_DATA_WORDS_SENT_ITER); + // uint64_t many_data_sent_iter = get_64b_result(tx_results[i], TX_TEST_IDX_MANY_DATA_WORDS_SENT_ITER); + // uint64_t num_packets = get_64b_result(tx_results[i], TX_TEST_IDX_NPKT); + // double bytes_per_pkt = static_cast(tx_words_sent) * PACKET_WORD_SIZE_BYTES / + // static_cast(num_packets); + + log_info( + LogTest, + "TX {} words sent = {}, elapsed cycles = {} -> BW = {:.2f} B/cycle", + i, + tx_words_sent, + tx_elapsed_cycles, + tx_bw); + // log_info(LogTest, "TX {} packets sent = {}, bytes/packet = {:.2f}, total iter = {}, zero data sent + // iter = {}, few data sent iter = {}, many data sent iter = {}", i, num_packets, bytes_per_pkt, iter, + // zero_data_sent_iter, few_data_sent_iter, many_data_sent_iter); + /* + stat[fmt::format("tx_words_sent_{}", i)] = tx_words_sent; + stat[fmt::format("tx_elapsed_cycles_{}", i)] = tx_elapsed_cycles; + stat[fmt::format("tx_bw_{}", i)] = tx_bw; + stat[fmt::format("tx_bytes_per_pkt_{}", i)] = bytes_per_pkt; + stat[fmt::format("tx_total_iter_{}", i)] = iter; + stat[fmt::format("tx_zero_data_sent_iter_{}", i)] = zero_data_sent_iter; + stat[fmt::format("tx_few_data_sent_iter_{}", i)] = few_data_sent_iter; + stat[fmt::format("tx_many_data_sent_iter_{}", i)] = many_data_sent_iter; + */ + } + log_info(LogTest, "Total TX BW = {:.2f} B/cycle", total_tx_bw); + } + + } catch (const std::exception& e) { + pass = false; + log_fatal(e.what()); + } + + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); + + if (pass) { + log_info(LogTest, "Test Passed"); + return 0; + } else { + log_fatal(LogTest, "Test Failed\n"); + return 1; + } +} diff --git a/tt_fabric/hw/inc/routing_table.h b/tt_fabric/hw/inc/routing_table.h index aac39816745..70c862cc009 100644 --- a/tt_fabric/hw/inc/routing_table.h +++ b/tt_fabric/hw/inc/routing_table.h @@ -57,6 +57,7 @@ struct fabric_router_l1_config_t { std::uint16_t my_mesh_id; // Do we need this if we tag routing tables with magic values for outbound eth 
channels // and route to local NOC? std::uint16_t my_device_id; + std::uint8_t padding[8]; // pad to 16-byte alignment. } __attribute__((packed)); /* diff --git a/tt_fabric/hw/inc/tt_fabric.h b/tt_fabric/hw/inc/tt_fabric.h index d5196c6605a..92c38dc5e14 100644 --- a/tt_fabric/hw/inc/tt_fabric.h +++ b/tt_fabric/hw/inc/tt_fabric.h @@ -9,9 +9,11 @@ #include "dataflow_api.h" #include "noc_overlay_parameters.h" #include "ethernet/dataflow_api.h" -#include "hw/inc/routing_table.h" -#include "hw/inc/tt_fabric_interface.h" -#include "hw/inc/eth_chan_noc_mapping.h" +#include "tt_fabric/hw/inc/routing_table.h" +#include "tt_fabric/hw/inc/tt_fabric_interface.h" +#include "tt_fabric/hw/inc/eth_chan_noc_mapping.h" + +using namespace tt::tt_fabric; constexpr ProgrammableCoreType fd_core_type = static_cast(FD_CORE_TYPE); @@ -21,10 +23,13 @@ const uint32_t SYNC_BUF_PTR_MASK = ((SYNC_BUF_SIZE << 1) - 1); extern uint64_t xy_local_addr; extern volatile local_pull_request_t* local_pull_request; -extern volatile tt::tt_fabric::fabric_router_l1_config_t* routing_table; +extern volatile fabric_router_l1_config_t* routing_table; uint64_t tt_fabric_send_pull_request(uint64_t dest_addr, volatile local_pull_request_t* local_pull_request); -bool tt_fabric_is_header_valid(packet_header_t* p_header); +uint32_t num_words_available_to_pull(volatile pull_request_t* pull_request); +uint32_t words_before_buffer_wrap(uint32_t buffer_size, uint32_t rd_ptr); +uint32_t advance_ptr(uint32_t buffer_size, uint32_t ptr, uint32_t inc_words); +uint32_t get_rd_ptr_offset_words(pull_request_t* pull_request); inline uint64_t get_timestamp() { uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); @@ -471,7 +476,7 @@ typedef struct fvc_producer_state { } } - template + template inline uint32_t pull_data_from_fvc_buffer() { uint32_t words_available = get_num_words_available(); words_available = std::min(words_available, packet_words_remaining); @@ -493,7 +498,10 @@ typedef struct fvc_producer_state { packet_words_remaining -= words_available; advance_out_rdptr(words_available); // issue noc write to noc target of pull request. - uint64_t dest_addr = ((uint64_t)get_next_hop_router_noc_xy() << 32) | FABRIC_ROUTER_REQ_QUEUE_START; + uint64_t dest_addr = socket_mode == false + ? ((uint64_t)get_next_hop_router_noc_xy() << 32) | FABRIC_ROUTER_REQ_QUEUE_START + : ((uint64_t)current_packet_header.session.target_offset_h << 32) | + current_packet_header.session.target_offset_l; packet_dest = tt_fabric_send_pull_request(dest_addr, local_pull_request); if (current_packet_header.routing.flags == INLINE_FORWARD) { curr_packet_valid = false; @@ -547,51 +555,59 @@ typedef struct fvc_producer_state { inline uint32_t process_inbound_packet() { uint32_t words_processed = 0; if (packet_is_for_local_chip()) { - if (current_packet_header.routing.flags == FORWARD && current_packet_header.session.command == ASYNC_WR) { - if (packet_in_progress == 0) { - packet_dest = ((uint64_t)current_packet_header.session.target_offset_h << 32) | - current_packet_header.session.target_offset_l; + if (current_packet_header.routing.flags == FORWARD) { + if (current_packet_header.session.command == ASYNC_WR) { + if (packet_in_progress == 0) { + packet_dest = ((uint64_t)current_packet_header.session.target_offset_h << 32) | + current_packet_header.session.target_offset_l; + packet_words_remaining -= PACKET_HEADER_SIZE_WORDS; + advance_out_wrptr(PACKET_HEADER_SIZE_WORDS); + advance_out_rdptr(PACKET_HEADER_SIZE_WORDS); + // subtract the header words. 
Remaining words are the data to be written to packet_dest. + // Remember to account for trailing bytes which may not be a full packet word. + packet_in_progress = 1; + words_processed = PACKET_HEADER_SIZE_WORDS; + words_processed += issue_async_write(); + } else { + flush_async_writes(); + if (packet_words_remaining) { + words_processed = issue_async_write(); + } else { + packet_in_progress = 0; + curr_packet_valid = false; + packet_timestamp = get_timestamp(); + } + } + } else if (current_packet_header.session.command == DSOCKET_WR) { + words_processed = pull_data_from_fvc_buffer(); + } + } else if (current_packet_header.routing.flags == INLINE_FORWARD) { + if (current_packet_header.session.command == SOCKET_CLOSE) { + words_processed = pull_data_from_fvc_buffer(); + } else { + uint64_t noc_addr = ((uint64_t)current_packet_header.session.target_offset_h << 32) | + current_packet_header.session.target_offset_l; + noc_fast_atomic_increment( + noc_index, + NCRISC_AT_CMD_BUF, + noc_addr, + NOC_UNICAST_WRITE_VC, + current_packet_header.packet_parameters.atomic_parameters.increment, + current_packet_header.packet_parameters.atomic_parameters.wrap_boundary, + false); + packet_words_remaining -= PACKET_HEADER_SIZE_WORDS; advance_out_wrptr(PACKET_HEADER_SIZE_WORDS); advance_out_rdptr(PACKET_HEADER_SIZE_WORDS); - // subtract the header words. Remaining words are the data to be written to packet_dest. - // Remember to account for trailing bytes which may not be a full packet word. - packet_in_progress = 1; words_processed = PACKET_HEADER_SIZE_WORDS; - words_processed += issue_async_write(); - } else { - flush_async_writes(); - if (packet_words_remaining) { - words_processed = issue_async_write(); - } else { - packet_in_progress = 0; - curr_packet_valid = false; - packet_timestamp = get_timestamp(); - } + fvc_pull_rdptr = fvc_out_rdptr; + update_remote_rdptr_cleared(); + curr_packet_valid = false; + packet_timestamp = get_timestamp(); } - } else if (current_packet_header.routing.flags == INLINE_FORWARD) { - uint64_t noc_addr = ((uint64_t)current_packet_header.session.target_offset_h << 32) | - current_packet_header.session.target_offset_l; - noc_fast_atomic_increment( - noc_index, - NCRISC_AT_CMD_BUF, - noc_addr, - NOC_UNICAST_WRITE_VC, - current_packet_header.packet_parameters.atomic_parameters.increment, - current_packet_header.packet_parameters.atomic_parameters.wrap_boundary, - false); - - packet_words_remaining -= PACKET_HEADER_SIZE_WORDS; - advance_out_wrptr(PACKET_HEADER_SIZE_WORDS); - advance_out_rdptr(PACKET_HEADER_SIZE_WORDS); - words_processed = PACKET_HEADER_SIZE_WORDS; - fvc_pull_rdptr = fvc_out_rdptr; - update_remote_rdptr_cleared(); - curr_packet_valid = false; - packet_timestamp = get_timestamp(); } } else { - words_processed = pull_data_from_fvc_buffer(); + words_processed = pull_data_from_fvc_buffer(); } return words_processed; } @@ -604,6 +620,518 @@ typedef struct fvc_producer_state { } } fvc_producer_state_t; +typedef struct fvcc_outbound_state { + volatile chan_payload_ptr remote_rdptr; + uint32_t remote_ptr_update_addr; + volatile ctrl_chan_msg_buf* + fvcc_buf; // fvcc buffer that receives messages that need to be forwarded over ethernet. + volatile ctrl_chan_sync_buf* fvcc_sync_buf; // sync buffer to hold pointer updates sent over ethernet + uint32_t remote_fvcc_buf_start; + uint32_t out_rdptr; + + // Check if ethernet receiver fvcc on neighboring chip is full. 
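+ // Read/write pointers advance modulo 2 * FVCC_BUF_SIZE so that a full buffer + // (distance == FVCC_BUF_SIZE) can be told apart from an empty one (pointers equal).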
+ bool is_remote_fvcc_full() { + uint32_t rd_ptr = remote_rdptr.ptr_cleared; + bool full = false; + if (out_rdptr != rd_ptr) { + uint32_t distance = out_rdptr >= rd_ptr ? out_rdptr - rd_ptr : out_rdptr + 2 * FVCC_BUF_SIZE - rd_ptr; + full = distance >= FVCC_BUF_SIZE; + } + return full; + } + + inline void init(uint32_t buf_start, uint32_t sync_buf_start, uint32_t remote_buf_start, uint32_t ptr_update_addr) { + uint32_t words = sizeof(fvcc_outbound_state) / 4; + uint32_t* ptr = (uint32_t*)this; + for (uint32_t i = 0; i < words; i++) { + ptr[i] = 0; + } + fvcc_buf = (volatile ctrl_chan_msg_buf*)buf_start; + fvcc_sync_buf = (volatile ctrl_chan_sync_buf*)sync_buf_start; + remote_fvcc_buf_start = remote_buf_start; + remote_ptr_update_addr = ptr_update_addr; + } + + inline uint32_t inc_ptr_with_wrap(uint32_t ptr) { return (ptr + 1) & FVCC_PTR_MASK; } + + inline void advance_out_rdptr() { out_rdptr = inc_ptr_with_wrap(out_rdptr); } + + inline void advance_fvcc_rdptr() { + uint32_t rd_ptr = remote_rdptr.ptr; + while (rd_ptr != fvcc_buf->rdptr.ptr) { + uint32_t msg_index = fvcc_buf->rdptr.ptr & FVCC_SIZE_MASK; + fvcc_buf->msg_buf[msg_index].packet_header.routing.flags = 0; + fvcc_buf->rdptr.ptr = inc_ptr_with_wrap(fvcc_buf->rdptr.ptr); + } + } + + template + inline uint32_t forward_data_from_fvcc_buffer() { + // If receiver ethernet fvcc is full, we cannot send more messages. + if (is_remote_fvcc_full()) { + return 0; + } + + if (fvcc_buf->wrptr.ptr != out_rdptr) { + // There are new messages to forward. + uint32_t msg_index = out_rdptr & FVCC_SIZE_MASK; + volatile packet_header_t* msg = &fvcc_buf->msg_buf[msg_index].packet_header; + bool msg_valid = msg->routing.flags != 0; + if (!msg_valid) { + return 0; + } + + uint32_t dest_addr = + remote_fvcc_buf_start + offsetof(ctrl_chan_msg_buf, msg_buf) + msg_index * sizeof(packet_header_t); + internal_::eth_send_packet( + 0, + (uint32_t)msg / PACKET_WORD_SIZE_BYTES, + dest_addr / PACKET_WORD_SIZE_BYTES, + PACKET_HEADER_SIZE_WORDS); + advance_out_rdptr(); + + uint32_t* sync_ptr = (uint32_t*)&fvcc_sync_buf->ptr[msg_index]; + sync_ptr[0] = out_rdptr; + sync_ptr[1] = 0; + sync_ptr[2] = 0; + sync_ptr[3] = out_rdptr; + internal_::eth_send_packet( + 0, ((uint32_t)sync_ptr) / PACKET_WORD_SIZE_BYTES, remote_ptr_update_addr / PACKET_WORD_SIZE_BYTES, 1); + } + + return PACKET_HEADER_SIZE_WORDS; + } + + inline void fvcc_handler() { + forward_data_from_fvcc_buffer(); + advance_fvcc_rdptr(); + } + +} fvcc_outbound_state_t; + +static_assert(sizeof(fvcc_outbound_state_t) % 4 == 0); + +// Fabric Virtual Control Channel (FVCC) Producer receives control/sync packets over ethernet from neighboring chip. +// Data in the producer is either destined for local chip, or has to make a noc hop +// to next outgoing ethernet port enroute to final destination. +// Control packets are forwarded to next fvcc consumer buffer in the route +// direction, if not meant for local device. +// If control packet is addressed to local device, FVCC producer can process the packet locally if +// it is a read/write ack, or forward the packet to Gatekeeper for further local processing. +typedef struct fvcc_inbound_state { + volatile chan_payload_ptr inbound_wrptr; + volatile chan_payload_ptr inbound_rdptr; + uint32_t remote_ptr_update_addr; + volatile ctrl_chan_msg_buf* fvcc_buf; // fvcc buffer that receives incoming control messages over ethernet. 
+ uint8_t chan_num; + uint8_t packet_in_progress; + uint8_t pad1; + uint8_t pad2; + uint32_t fvc_out_wrptr; + uint32_t fvc_out_rdptr; + bool curr_packet_valid; + bool packet_corrupted; + uint64_t packet_timestamp; + uint64_t gk_fvcc_buf_addr; // fvcc buffer in gatekeeper. + volatile packet_header_t* current_packet_header; + + inline void init(uint32_t buf_start, uint32_t ptr_update_addr, uint64_t gk_fvcc_buf_start) { + uint32_t words = sizeof(fvcc_inbound_state) / 4; + uint32_t* ptr = (uint32_t*)this; + for (uint32_t i = 0; i < words; i++) { + ptr[i] = 0; + } + chan_num = 1; + fvcc_buf = (volatile ctrl_chan_msg_buf*)buf_start; + gk_fvcc_buf_addr = gk_fvcc_buf_start; + remote_ptr_update_addr = ptr_update_addr; + } + + inline uint32_t inc_ptr_with_wrap(uint32_t ptr, uint32_t inc) { return (ptr + inc) & FVCC_PTR_MASK; } + + inline void advance_local_wrptr(uint32_t inc) { inbound_wrptr.ptr = inc_ptr_with_wrap(inbound_wrptr.ptr, inc); } + + inline void advance_out_wrptr(uint32_t inc) { fvc_out_wrptr = inc_ptr_with_wrap(fvc_out_wrptr, inc); } + + inline void advance_out_rdptr(uint32_t inc) { fvc_out_rdptr = inc_ptr_with_wrap(fvc_out_rdptr, inc); } + + inline uint32_t get_num_msgs_available() const { + uint32_t wrptr = inbound_wrptr.ptr; + uint32_t msgs = 0; + if (fvc_out_rdptr != wrptr) { + msgs = wrptr > fvc_out_rdptr ? wrptr - fvc_out_rdptr : wrptr + 2 * FVCC_BUF_SIZE - fvc_out_rdptr; + } + return msgs; + } + + inline uint32_t get_num_msgs_free() { return FVCC_BUF_SIZE - get_num_msgs_available(); } + + inline uint32_t words_before_local_buffer_wrap() { + if (inbound_wrptr.ptr >= FVCC_BUF_SIZE) { + return (FVCC_BUF_SIZE * 2 - inbound_wrptr.ptr) * PACKET_HEADER_SIZE_WORDS; + } else { + return (FVCC_BUF_SIZE - inbound_wrptr.ptr) * PACKET_HEADER_SIZE_WORDS; + } + } + + inline bool get_curr_packet_valid() { + if (!curr_packet_valid && (get_num_msgs_available() >= 1)) { + uint32_t msg_index = fvc_out_rdptr & FVCC_SIZE_MASK; + bool msg_valid = fvcc_buf->msg_buf[msg_index].packet_header.routing.flags != 0; + if (msg_valid) { + current_packet_header = (volatile packet_header_t*)&fvcc_buf->msg_buf[msg_index]; + if (tt_fabric_is_header_valid((packet_header_t*)current_packet_header)) { + this->curr_packet_valid = true; + } else { + this->packet_corrupted = true; + } + } + } + return this->curr_packet_valid; + } + + inline uint32_t get_local_buffer_read_addr() { + uint32_t addr = (uint32_t)&fvcc_buf->msg_buf[fvc_out_rdptr & FVCC_SIZE_MASK]; + return addr; + } + + inline uint32_t get_local_buffer_write_addr() { + uint32_t addr = (uint32_t)&fvcc_buf->msg_buf[inbound_wrptr.ptr & FVCC_SIZE_MASK]; + ; + return addr; + } + + template + inline void update_remote_rdptr_sent() { + if (inbound_wrptr.ptr_cleared != inbound_rdptr.ptr) { + inbound_rdptr.ptr = inbound_wrptr.ptr_cleared; + if constexpr (fvc_mode == FVC_MODE_ROUTER) { + internal_::eth_send_packet( + 0, + ((uint32_t)&inbound_rdptr) / PACKET_WORD_SIZE_BYTES, + remote_ptr_update_addr / PACKET_WORD_SIZE_BYTES, + 1); + } + } + } + + template + inline void update_remote_rdptr_cleared() { + if (fvc_out_rdptr != inbound_rdptr.ptr_cleared) { + inbound_rdptr.ptr_cleared = fvc_out_rdptr; + if constexpr (fvc_mode == FVC_MODE_ROUTER) { + internal_::eth_send_packet( + 0, + ((uint32_t)&inbound_rdptr) / PACKET_WORD_SIZE_BYTES, + remote_ptr_update_addr / PACKET_WORD_SIZE_BYTES, + 1); + } + } + } + + uint32_t get_next_hop_router_noc_xy() { + uint32_t dst_mesh_id = current_packet_header->routing.dst_mesh_id; + if (dst_mesh_id != routing_table->my_mesh_id) { + uint32_t 
next_port = routing_table->inter_mesh_table.dest_entry[dst_mesh_id]; + return eth_chan_to_noc_xy[noc_index][next_port]; + } else { + uint32_t dst_device_id = current_packet_header->routing.dst_dev_id; + uint32_t next_port = routing_table->intra_mesh_table.dest_entry[dst_device_id]; + return eth_chan_to_noc_xy[noc_index][next_port]; + } + } + + inline bool packet_is_for_local_chip() { + return (current_packet_header->routing.dst_mesh_id == routing_table->my_mesh_id) && + (current_packet_header->routing.dst_dev_id == routing_table->my_device_id); + } + + // issue a pull request. + // currently blocks till the request queue has space. + // This needs to be non blocking, so that if one fvc pull request queue is full, + // we can process other fvcs and come back to check status of this pull request later. + inline void forward_message(uint64_t dest_addr) { + uint64_t noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, wrptr); + noc_fast_atomic_increment( + noc_index, + NCRISC_AT_CMD_BUF, + noc_addr, + NOC_UNICAST_WRITE_VC, + 1, + FVCC_BUF_LOG_SIZE, + false, + false, + (uint32_t)&fvcc_buf->wrptr.ptr); + while (!ncrisc_noc_nonposted_atomics_flushed(noc_index)); + uint32_t wrptr = fvcc_buf->wrptr.ptr; + noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, rdptr); + while (1) { + noc_async_read_one_packet(noc_addr, (uint32_t)(&fvcc_buf->rdptr.ptr), 4); + noc_async_read_barrier(); + if (!fvcc_buf_ptrs_full(wrptr, fvcc_buf->rdptr.ptr)) { + break; + } +#if defined(COMPILE_FOR_ERISC) + else { + // Consumer pull request buffer is full + // Context switch to enable base firmware routing + // as it might be handling slow dispatch traffic + internal_::risc_context_switch(); + } +#endif + } + uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; + noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); + noc_async_write_one_packet((uint32_t)(current_packet_header), noc_addr, sizeof(packet_header_t), noc_index); + } + + template + inline void process_inbound_packet() { + if (packet_is_for_local_chip()) { + if (current_packet_header->routing.flags == SYNC) { + if (current_packet_header->session.command == SOCKET_OPEN or + current_packet_header->session.command == SOCKET_CONNECT) { + // forward socket related messages to gatekeeper. + forward_message(gk_fvcc_buf_addr); + } else if (current_packet_header->session.command == ASYNC_WR_RESP) { + // Write response. Decrement transaction count for respective transaction id. + uint64_t noc_addr = ((uint64_t)current_packet_header->session.target_offset_h << 32) | + current_packet_header->session.target_offset_l; + noc_fast_atomic_increment( + noc_index, NCRISC_AT_CMD_BUF, noc_addr, NOC_UNICAST_WRITE_VC, -1, 31, false); + } + } + } else { + // Control message is not meant for local chip. Forward to next router enroute to destination. 
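+ // The destination is a 64-bit NOC address: next-hop router XY in the upper 32 bits, + // FVCC_OUT_BUF_START in the lower 32, mirroring the data-plane pull request path.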
+ uint64_t dest_addr = ((uint64_t)get_next_hop_router_noc_xy() << 32) | FVCC_OUT_BUF_START; + forward_message(dest_addr); + } + curr_packet_valid = false; + advance_out_wrptr(1); + advance_out_rdptr(1); + noc_async_write_barrier(); + update_remote_rdptr_cleared(); + } + + template + inline void fvcc_handler() { + if (get_curr_packet_valid()) { + process_inbound_packet(); + } + update_remote_rdptr_sent(); + } + +} fvcc_inbound_state_t; + +typedef struct socket_reader_state { + volatile chan_payload_ptr remote_rdptr; + uint8_t packet_in_progress; + + uint32_t packet_words_remaining; + uint32_t fvc_out_wrptr; + uint32_t fvc_out_rdptr; + uint32_t fvc_pull_wrptr; + uint32_t buffer_size; + uint32_t buffer_start; + uint32_t remote_buffer_start; + uint32_t pull_words_in_flight; + uint32_t words_since_last_sync; + + uint32_t get_num_words_free() { + uint32_t rd_ptr = remote_rdptr.ptr; + uint32_t words_occupied = 0; + if (fvc_pull_wrptr != rd_ptr) { + words_occupied = + fvc_pull_wrptr > rd_ptr ? fvc_pull_wrptr - rd_ptr : buffer_size * 2 + fvc_pull_wrptr - rd_ptr; + } + return buffer_size - words_occupied; + } + + uint32_t get_remote_num_words_free() { + uint32_t rd_ptr = remote_rdptr.ptr_cleared; + uint32_t words_occupied = 0; + if (fvc_out_wrptr != rd_ptr) { + words_occupied = fvc_out_wrptr > rd_ptr ? fvc_out_wrptr - rd_ptr : buffer_size * 2 + fvc_out_wrptr - rd_ptr; + } + return buffer_size - words_occupied; + } + + inline void init(uint32_t data_buf_start, uint32_t data_buf_size_words) { + uint32_t words = sizeof(socket_reader_state) / 4; + uint32_t* ptr = (uint32_t*)this; + for (uint32_t i = 0; i < words; i++) { + ptr[i] = 0; + } + buffer_start = data_buf_start; + buffer_size = data_buf_size_words; + remote_buffer_start = data_buf_start + buffer_size * PACKET_WORD_SIZE_BYTES; + } + + inline uint32_t words_before_local_buffer_wrap() { + if (fvc_pull_wrptr >= buffer_size) { + return buffer_size * 2 - fvc_pull_wrptr; + } else { + return buffer_size - fvc_pull_wrptr; + } + } + + inline uint32_t get_local_buffer_pull_addr() { + uint32_t addr = buffer_start; + uint32_t offset = fvc_pull_wrptr; + if (offset >= buffer_size) { + offset -= buffer_size; + } + addr = addr + (offset * PACKET_WORD_SIZE_BYTES); + return addr; + } + + inline uint32_t get_local_buffer_read_addr() { + uint32_t addr = buffer_start; + uint32_t offset = fvc_out_rdptr; + if (offset >= buffer_size) { + offset -= buffer_size; + } + addr = addr + (offset * PACKET_WORD_SIZE_BYTES); + return addr; + } + + inline uint32_t get_remote_buffer_write_addr() { + uint32_t addr = remote_buffer_start; + uint32_t offset = fvc_out_wrptr; + if (offset >= buffer_size) { + offset -= buffer_size; + } + addr = addr + (offset * PACKET_WORD_SIZE_BYTES); + return addr; + } + + inline void advance_pull_wrptr(uint32_t num_words) { + uint32_t temp = fvc_pull_wrptr + num_words; + if (temp >= buffer_size * 2) { + temp -= buffer_size * 2; + } + fvc_pull_wrptr = temp; + } + + inline void advance_out_wrptr(uint32_t num_words) { + uint32_t temp = fvc_out_wrptr + num_words; + if (temp >= buffer_size * 2) { + temp -= buffer_size * 2; + } + fvc_out_wrptr = temp; + } + + inline void advance_out_rdptr(uint32_t num_words) { + uint32_t temp = fvc_out_rdptr + num_words; + if (temp >= buffer_size * 2) { + temp -= buffer_size * 2; + } + fvc_out_rdptr = temp; + } + + inline void register_pull_data(uint32_t num_words_to_pull) { + pull_words_in_flight += num_words_to_pull; + advance_pull_wrptr(num_words_to_pull); + words_since_last_sync += num_words_to_pull; + 
packet_words_remaining -= num_words_to_pull; + } + + inline uint32_t get_num_words_to_pull(volatile pull_request_t* pull_request) { + uint32_t num_words_to_pull = num_words_available_to_pull(pull_request); + uint32_t num_words_before_wrap = words_before_buffer_wrap(pull_request->buffer_size, pull_request->rd_ptr); + + num_words_to_pull = std::min(num_words_to_pull, num_words_before_wrap); + uint32_t socket_buffer_space = get_num_words_free(); + num_words_to_pull = std::min(num_words_to_pull, socket_buffer_space); + + if (num_words_to_pull == 0) { + return 0; + } + + uint32_t space_before_wptr_wrap = words_before_local_buffer_wrap(); + num_words_to_pull = std::min(num_words_to_pull, space_before_wptr_wrap); + + return num_words_to_pull; + } + + inline uint32_t pull_socket_data(volatile pull_request_t* pull_request) { + volatile uint32_t* temp = (volatile uint32_t*)0xffb2010c; + if (packet_in_progress == 0) { + uint32_t size = pull_request->size; + packet_words_remaining = (size + PACKET_WORD_SIZE_BYTES - 1) >> 4; + packet_in_progress = 1; + } + + uint32_t num_words_to_pull = get_num_words_to_pull(pull_request); + if (num_words_to_pull == 0) { + temp[0] = 0xdead1111; + return 0; + } + + uint32_t rd_offset = get_rd_ptr_offset_words((pull_request_t*)pull_request); + uint64_t src_addr = pull_request->buffer_start + (rd_offset * PACKET_WORD_SIZE_BYTES); + uint32_t local_addr = get_local_buffer_pull_addr(); + + // pull_data_from_remote(); + noc_async_read(src_addr, local_addr, num_words_to_pull * PACKET_WORD_SIZE_BYTES); + register_pull_data(num_words_to_pull); + pull_request->rd_ptr = advance_ptr(pull_request->buffer_size, pull_request->rd_ptr, num_words_to_pull); + + return num_words_to_pull; + } + + template + inline uint32_t push_socket_data() { + uint32_t total_words_to_forward = 0; + uint32_t wrptr = fvc_pull_wrptr; + + total_words_to_forward = + wrptr > fvc_out_rdptr ? wrptr - fvc_out_rdptr : buffer_size * 2 + wrptr - fvc_out_rdptr; + + uint32_t remote_fvc_buffer_space = get_remote_num_words_free(); + total_words_to_forward = std::min(total_words_to_forward, remote_fvc_buffer_space); + if (total_words_to_forward == 0) { + return 0; + } + + if (packet_words_remaining and (words_since_last_sync < FVC_SYNC_THRESHOLD)) { + // not enough data to forward. + // wait for more data. + return 0; + } + + if constexpr (live == true) { + uint32_t src_addr = 0; + uint32_t dest_addr = 0; // should be second half of fvc buffer. + uint32_t words_remaining = total_words_to_forward; + while (words_remaining) { + uint32_t num_words_before_local_wrap = words_before_buffer_wrap(buffer_size, fvc_out_rdptr); + uint32_t num_words_before_remote_wrap = words_before_buffer_wrap(buffer_size, fvc_out_wrptr); + uint32_t words_to_forward = std::min(num_words_before_local_wrap, num_words_before_remote_wrap); + words_to_forward = std::min(words_to_forward, words_remaining); + // max 8K bytes + words_to_forward = std::min(words_to_forward, DEFAULT_MAX_NOC_SEND_WORDS); + src_addr = get_local_buffer_read_addr(); + dest_addr = get_remote_buffer_write_addr(); + + // TODO: Issue a noc write here to forward data. 
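+ // Presumably a call of the form + // noc_async_write(src_addr, dest_noc_addr, words_to_forward * PACKET_WORD_SIZE_BYTES); + // with dest_noc_addr combining the consumer's NOC XY and dest_addr; the exact + // destination encoding is not settled in this revision.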
+ + advance_out_rdptr(words_to_forward); + advance_out_wrptr(words_to_forward); + words_remaining -= words_to_forward; + } + } else { + advance_out_rdptr(total_words_to_forward); + advance_out_wrptr(total_words_to_forward); + remote_rdptr.ptr = fvc_out_rdptr; + remote_rdptr.ptr_cleared = fvc_out_wrptr; + } + words_since_last_sync -= total_words_to_forward; + return total_words_to_forward; + } +} socket_reader_state_t; + +static_assert(sizeof(socket_reader_state_t) % 4 == 0); typedef struct router_state { uint32_t sync_in; @@ -615,28 +1143,6 @@ typedef struct router_state { inline uint64_t get_timestamp_32b() { return reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); } -void tt_fabric_add_header_checksum(packet_header_t* p_header) { - uint16_t* ptr = (uint16_t*)p_header; - uint32_t sum = 0; - for (uint32_t i = 2; i < sizeof(packet_header_t) / 2; i++) { - sum += ptr[i]; - } - sum = ~sum; - sum += sum; - p_header->packet_parameters.misc_parameters.words[0] = sum; -} - -bool tt_fabric_is_header_valid(packet_header_t* p_header) { - uint16_t* ptr = (uint16_t*)p_header; - uint32_t sum = 0; - for (uint32_t i = 2; i < sizeof(packet_header_t) / 2; i++) { - sum += ptr[i]; - } - sum = ~sum; - sum += sum; - return (p_header->packet_parameters.misc_parameters.words[0] == sum); -} - void zero_l1_buf(tt_l1_ptr uint32_t* buf, uint32_t size_bytes) { for (uint32_t i = 0; i < size_bytes / 4; i++) { buf[i] = 0; diff --git a/tt_fabric/hw/inc/tt_fabric_api.h b/tt_fabric/hw/inc/tt_fabric_api.h index 6978f58ff9d..fdad852a35d 100644 --- a/tt_fabric/hw/inc/tt_fabric_api.h +++ b/tt_fabric/hw/inc/tt_fabric_api.h @@ -9,17 +9,14 @@ #include "dataflow_api.h" #include "noc_overlay_parameters.h" #include "ethernet/dataflow_api.h" -#include "tt_fabric.hpp" +#include "tt_fabric_interface.h" -typedef struct tt_fabric_endpoint_sync { - uint32_t sync_addr : 24; - uint32_t endpoint_type : 8 -} tt_fabric_endpoint_sync_t; +using namespace tt::tt_fabric; -static_assert(sizeof(tt_fabric_endpoint_sync_t) == 4); - -extern local_pull_request_t local_pull_request; +extern volatile local_pull_request_t* local_pull_request; +extern volatile fabric_client_interface_t* client_interface; +/* inline void fabric_async_write( uint32_t routing_plane, // the network plane to use for this transaction uint32_t src_addr, // source address in sender’s memory @@ -40,56 +37,125 @@ inline void fabric_async_write( uint64_t router_request_queue = NOC_XY_ADDR(2, 0, 0x19000); tt_fabric_send_pull_request(router_request_queue, &local_pull_request); } +*/ -/** - * Polling for ready signal from the remote peers of all input and output queues. - * Blocks until all are ready, but doesn't block polling on each individual queue. - * Returns false in case of timeout. 
- */ -bool wait_all_src_dest_ready(volatile router_state_t* router_state, uint32_t timeout_cycles = 0) { - bool src_ready = false; - bool dest_ready = false; +inline uint32_t get_next_hop_router_noc_xy(uint32_t routing_plane, uint32_t dst_mesh_id, uint32_t dst_dev_id) { + if (dst_mesh_id != routing_table[routing_plane].my_mesh_id) { + uint32_t next_port = routing_table[routing_plane].inter_mesh_table.dest_entry[dst_mesh_id]; + return eth_chan_to_noc_xy[noc_index][next_port]; + } else { + uint32_t next_port = routing_table[routing_plane].intra_mesh_table.dest_entry[dst_dev_id]; + return eth_chan_to_noc_xy[noc_index][next_port]; + } +} - uint32_t iters = 0; +inline void send_message_to_gk() { + uint64_t gk_noc_base = client_interface->gk_msg_buf_addr; + uint64_t noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, wrptr); + noc_fast_atomic_increment( + noc_index, + NCRISC_AT_CMD_BUF, + noc_addr, + NOC_UNICAST_WRITE_VC, + 1, + FVCC_BUF_LOG_SIZE, + false, + false, + (uint32_t)&client_interface->wrptr.ptr); + while (!ncrisc_noc_nonposted_atomics_flushed(noc_index)); + uint32_t wrptr = client_interface->wrptr.ptr; + noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, rdptr); + while (1) { + noc_async_read_one_packet(noc_addr, (uint32_t)(&client_interface->rdptr.ptr), 4); + noc_async_read_barrier(); + if (!fvcc_buf_ptrs_full(wrptr, client_interface->rdptr.ptr)) { + break; + } + } + uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; + noc_addr = gk_noc_base + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); + noc_async_write_one_packet((uint32_t)(&client_interface->gk_message), noc_addr, sizeof(packet_header_t), noc_index); + noc_async_write_barrier(); +} - uint32_t start_timestamp = get_timestamp_32b(); - uint32_t sync_in_addr = ((uint32_t)&router_state->sync_in) / PACKET_WORD_SIZE_BYTES; - uint32_t sync_out_addr = ((uint32_t)&router_state->sync_out) / PACKET_WORD_SIZE_BYTES; +inline socket_handle_t* fabric_socket_open( + uint32_t routing_plane, // the network plane to use for this socket + uint16_t epoch_id, // Temporal epoch for which the socket is being opened + uint16_t socket_id, // Socket Id to open + uint8_t socket_type, // Unicast, Multicast, SSocket, DSocket + uint8_t direction, // Send or Receive + uint16_t remote_mesh_id, // Remote mesh/device that is the socket data sender/receiver. + uint16_t remote_dev_id, + uint8_t fvc // fabric virtual channel. 
+) { + uint32_t socket_count = client_interface->socket_count; + socket_handle_t* socket_handle = (socket_handle_t*)&client_interface->socket_handles[socket_count]; + socket_count++; + client_interface->socket_count = socket_count; + socket_handle->socket_state = SocketState::OPENING; - uint32_t scratch_addr = ((uint32_t)&router_state->scratch) / PACKET_WORD_SIZE_BYTES; - router_state->scratch[0] = 0xAA; - // send_buf[1] = 0x0; - // send_buf[2] = 0x0; - // send_buf[3] = 0x0; + if (direction == SOCKET_DIRECTION_SEND) { + client_interface->gk_message.packet_header.routing.dst_mesh_id = remote_mesh_id; + client_interface->gk_message.packet_header.routing.dst_dev_id = remote_dev_id; + } else { + client_interface->gk_message.packet_header.routing.src_mesh_id = remote_mesh_id; + client_interface->gk_message.packet_header.routing.src_dev_id = remote_dev_id; + } + client_interface->gk_message.packet_header.routing.flags = SYNC; + client_interface->gk_message.packet_header.session.command = SOCKET_OPEN; + client_interface->gk_message.packet_header.session.target_offset_h = client_interface->pull_req_buf_addr >> 32; + client_interface->gk_message.packet_header.session.target_offset_l = (uint32_t)client_interface->pull_req_buf_addr; + client_interface->gk_message.packet_header.session.ack_offset_h = xy_local_addr >> 32; + client_interface->gk_message.packet_header.session.ack_offset_l = (uint32_t)socket_handle; + client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_id = socket_id; + client_interface->gk_message.packet_header.packet_parameters.socket_parameters.epoch_id = epoch_id; + client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_type = socket_type; + client_interface->gk_message.packet_header.packet_parameters.socket_parameters.socket_direction = direction; + client_interface->gk_message.packet_header.packet_parameters.socket_parameters.routing_plane = routing_plane; + tt_fabric_add_header_checksum((packet_header_t*)&client_interface->gk_message.packet_header); + send_message_to_gk(); + return socket_handle; +} - while (!src_ready or !dest_ready) { - if (router_state->sync_out != 0xAA) { - internal_::eth_send_packet(0, scratch_addr, sync_in_addr, 1); - } else { - dest_ready = true; - } +inline void fabric_socket_close(socket_handle_t* socket_handle) { + packet_header_t* packet_header = (packet_header_t*)&client_interface->gk_message.packet_header; + uint32_t dst_mesh_id = socket_handle->rcvr_mesh_id; + uint32_t dst_dev_id = socket_handle->rcvr_dev_id; + packet_header->routing.flags = INLINE_FORWARD; + packet_header->routing.dst_mesh_id = dst_mesh_id; + packet_header->routing.dst_dev_id = dst_dev_id; + packet_header->routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; + packet_header->session.command = SOCKET_CLOSE; + packet_header->session.target_offset_l = (uint32_t)socket_handle->pull_notification_adddr; + packet_header->session.target_offset_h = socket_handle->pull_notification_adddr >> 32; + tt_fabric_add_header_checksum(packet_header); - if (!src_ready && router_state->sync_in == 0xAA) { - internal_::eth_send_packet(0, sync_in_addr, sync_out_addr, 1); - src_ready = true; - } + uint32_t* dst = (uint32_t*)&client_interface->local_pull_request.pull_request; + uint32_t* src = (uint32_t*)packet_header; + for (uint32_t i = 0; i < sizeof(pull_request_t) / 4; i++) { + dst[i] = src[i]; + } + uint64_t dest_addr = + ((uint64_t)get_next_hop_router_noc_xy(socket_handle->routing_plane, dst_mesh_id, dst_dev_id) << 32) | + 
FABRIC_ROUTER_REQ_QUEUE_START; + tt_fabric_send_pull_request(dest_addr, (volatile local_pull_request_t*)&client_interface->local_pull_request); +} - iters++; - if (timeout_cycles > 0) { - uint32_t cycles_since_start = get_timestamp_32b() - start_timestamp; - if (cycles_since_start > timeout_cycles) { - return false; - } - } +inline void fabric_socket_connect(socket_handle_t* socket_handle) { + // wait for socket state to change to Active. + // Gatekeeper will update local socket handle when the receiver for send socket + // is ready. + while (((volatile socket_handle_t*)socket_handle)->socket_state != SocketState::ACTIVE); +} -#if defined(COMPILE_FOR_ERISC) - if ((timeout_cycles == 0) && (iters & 0xFFF) == 0) { - // if timeout is disabled, context switch every 4096 iterations. - // this is necessary to allow ethernet routing layer to operate. - // this core may have pending ethernet routing work. - internal_::risc_context_switch(); +inline void fabric_endpoint_init() { + uint64_t noc_addr = client_interface->gk_interface_addr + offsetof(gatekeeper_info_t, ep_sync); + client_interface->return_status[0] = 0; + while (1) { + noc_async_read_one_packet(noc_addr, (uint32_t)&client_interface->return_status[0], 4); + noc_async_read_barrier(); + if (client_interface->return_status[0] != 0) { + break; } -#endif } - return true; } diff --git a/tt_fabric/hw/inc/tt_fabric_interface.h b/tt_fabric/hw/inc/tt_fabric_interface.h index affefa22e20..9fd7d030178 100644 --- a/tt_fabric/hw/inc/tt_fabric_interface.h +++ b/tt_fabric/hw/inc/tt_fabric_interface.h @@ -7,17 +7,18 @@ #include "eth_l1_address_map.h" #include "noc/noc_parameters.h" -typedef struct tt_fabric_endpoint_sync { +namespace tt::tt_fabric { + +typedef struct _endpoint_sync { uint32_t sync_addr : 24; uint32_t endpoint_type : 8; -} tt_fabric_endpoint_sync_t; +} endpoint_sync_t; -static_assert(sizeof(tt_fabric_endpoint_sync_t) == 4); +static_assert(sizeof(endpoint_sync_t) == 4); constexpr uint32_t PACKET_WORD_SIZE_BYTES = 16; constexpr uint32_t NUM_WR_CMD_BUFS = 4; -constexpr uint32_t DEFAULT_MAX_NOC_SEND_WORDS = - (NUM_WR_CMD_BUFS - 1) * (NOC_MAX_BURST_WORDS * NOC_WORD_BYTES) / PACKET_WORD_SIZE_BYTES; +constexpr uint32_t DEFAULT_MAX_NOC_SEND_WORDS = (NOC_MAX_BURST_WORDS * NOC_WORD_BYTES) / PACKET_WORD_SIZE_BYTES; constexpr uint32_t DEFAULT_MAX_ETH_SEND_WORDS = 2 * 1024; constexpr uint32_t FVC_SYNC_THRESHOLD = 256; @@ -30,6 +31,9 @@ enum SessionCommand : uint32_t { SSOCKET_WR = (0x1 << 5), ATOMIC_INC = (0x1 << 6), ATOMIC_READ_INC = (0x1 << 7), + SOCKET_OPEN = (0x1 << 8), + SOCKET_CLOSE = (0x1 << 9), + SOCKET_CONNECT = (0x1 << 10), }; #define INVALID 0x0 @@ -42,7 +46,7 @@ enum SessionCommand : uint32_t { #define TERMINATE 0x40 #define NOP 0xFF -typedef struct tt_routing { +typedef struct _tt_routing { uint32_t packet_size_bytes; uint16_t dst_mesh_id; // Remote mesh uint16_t dst_dev_id; // Remote device @@ -55,7 +59,7 @@ typedef struct tt_routing { static_assert(sizeof(tt_routing) == 16); -typedef struct tt_session { +typedef struct _tt_session { SessionCommand command; uint32_t target_offset_l; // RDMA address uint32_t target_offset_h; @@ -67,7 +71,7 @@ typedef struct tt_session { static_assert(sizeof(tt_session) == 20); -typedef struct mcast_params { +typedef struct _mcast_params { uint16_t east; uint16_t west; uint16_t north; @@ -75,11 +79,17 @@ typedef struct mcast_params { uint32_t socket_id; // Socket Id for DSocket Multicast. Ignored for ASYNC multicast. 
} mcast_params; -typedef struct socket_params { - uint32_t socket_id; +typedef struct _socket_params { + uint32_t padding1; + uint16_t socket_id; + uint16_t epoch_id; + uint8_t socket_type; + uint8_t socket_direction; + uint8_t routing_plane; + uint8_t padding; } socket_params; -typedef struct atomic_params { +typedef struct _atomic_params { uint32_t padding; uint32_t return_offset; // L1 offset where atomic read should be returned. Noc X/Y is taken from tt_session.ack_offset @@ -87,17 +97,17 @@ typedef struct atomic_params { uint32_t wrap_boundary : 8; } atomic_params; -typedef struct read_params { +typedef struct _read_params { uint32_t return_offset_l; // address where read data should be copied uint32_t return_offset_h; uint32_t size; // number of bytes to read } read_params; -typedef struct misc_params { +typedef struct _misc_params { uint32_t words[3]; } misc_params; -typedef union packet_params { +typedef union _packet_params { mcast_params mcast_parameters; socket_params socket_parameters; atomic_params atomic_parameters; @@ -106,7 +116,7 @@ typedef union packet_params { uint8_t bytes[12]; } packet_params; -typedef struct packet_header { +typedef struct _packet_header { packet_params packet_parameters; tt_session session; tt_routing routing; @@ -115,7 +125,29 @@ typedef struct packet_header { const uint32_t PACKET_HEADER_SIZE_BYTES = 48; const uint32_t PACKET_HEADER_SIZE_WORDS = PACKET_HEADER_SIZE_BYTES / PACKET_WORD_SIZE_BYTES; -static_assert(sizeof(packet_header) == PACKET_HEADER_SIZE_BYTES); +static_assert(sizeof(packet_header_t) == PACKET_HEADER_SIZE_BYTES); + +void tt_fabric_add_header_checksum(packet_header_t* p_header) { + uint16_t* ptr = (uint16_t*)p_header; + uint32_t sum = 0; + for (uint32_t i = 2; i < sizeof(packet_header_t) / 2; i++) { + sum += ptr[i]; + } + sum = ~sum; + sum += sum; + p_header->packet_parameters.misc_parameters.words[0] = sum; +} + +bool tt_fabric_is_header_valid(packet_header_t* p_header) { + uint16_t* ptr = (uint16_t*)p_header; + uint32_t sum = 0; + for (uint32_t i = 2; i < sizeof(packet_header_t) / 2; i++) { + sum += ptr[i]; + } + sum = ~sum; + sum += sum; + return (p_header->packet_parameters.misc_parameters.words[0] == sum); +} // This is a pull request entry for a fabric router. // Pull request issuer populates these entries to identify @@ -129,7 +161,7 @@ static_assert(sizeof(packet_header) == PACKET_HEADER_SIZE_BYTES); // request queue. // This is typical of fabric routers forwarding data over noc/ethernet hops. // -typedef struct pull_request { +typedef struct _pull_request { uint32_t wr_ptr; // Current value of write pointer. uint32_t rd_ptr; // Current value of read pointer. Points to first byte of pull data. uint32_t size; // Total number of bytes that need to be forwarded. 
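To see the header checksum above concretely: tt_fabric_add_header_checksum() sums the header as 16-bit words, skipping the first two words (the checksum's own storage in misc_parameters.words[0]), then stores 2 * ~sum; tt_fabric_is_header_valid() recomputes the same value and compares. The following standalone sketch mirrors that arithmetic; mock_header is a hypothetical stand-in for the real packet_header_t layout, used only to keep the example self-contained.

#include <cassert>
#include <cstdint>
#include <cstring>

struct mock_header {           // 48 bytes, mirroring PACKET_HEADER_SIZE_BYTES
    uint32_t checksum_word;    // plays the role of misc_parameters.words[0]
    uint8_t payload[44];
};
static_assert(sizeof(mock_header) == 48);

uint32_t header_sum(const mock_header* h) {
    const uint16_t* ptr = reinterpret_cast<const uint16_t*>(h);
    uint32_t sum = 0;
    for (uint32_t i = 2; i < sizeof(mock_header) / 2; i++) {  // skip the checksum's own storage
        sum += ptr[i];
    }
    sum = ~sum;
    sum += sum;  // same fold as tt_fabric_add_header_checksum
    return sum;
}

int main() {
    mock_header h{};
    std::memset(h.payload, 0x5A, sizeof(h.payload));
    h.checksum_word = header_sum(&h);           // the "add" step
    assert(h.checksum_word == header_sum(&h));  // the "is_valid" step passes
    h.payload[7] ^= 1;                          // corrupt a single bit
    assert(h.checksum_word != header_sum(&h));  // validation now fails
    return 0;
}

This is a lightweight integrity fold rather than a standard ones'-complement checksum; a header that fails it causes the gatekeeper loop further down to bail out with PACKET_QUEUE_TEST_BAD_HEADER.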
@@ -144,17 +176,17 @@ typedef struct pull_request { const uint32_t PULL_REQ_SIZE_BYTES = 48; -static_assert(sizeof(pull_request) == PULL_REQ_SIZE_BYTES); -static_assert(sizeof(pull_request) == sizeof(packet_header)); +static_assert(sizeof(pull_request_t) == PULL_REQ_SIZE_BYTES); +static_assert(sizeof(pull_request_t) == sizeof(packet_header_t)); -typedef union chan_request_entry { +typedef union _chan_request_entry { pull_request_t pull_request; packet_header_t packet_header; uint8_t bytes[48]; } chan_request_entry_t; const uint32_t CHAN_PTR_SIZE_BYTES = 16; -typedef struct chan_ptr { +typedef struct _chan_ptr { uint32_t ptr; uint32_t pad[3]; } chan_ptr; @@ -166,7 +198,7 @@ const uint32_t CHAN_REQ_BUF_SIZE_MASK = (CHAN_REQ_BUF_SIZE - 1); const uint32_t CHAN_REQ_BUF_PTR_MASK = ((CHAN_REQ_BUF_SIZE << 1) - 1); const uint32_t CHAN_REQ_BUF_SIZE_BYTES = 2 * CHAN_PTR_SIZE_BYTES + CHAN_REQ_BUF_SIZE * PULL_REQ_SIZE_BYTES; -typedef struct chan_req_buf { +typedef struct _chan_req_buf { chan_ptr wrptr; chan_ptr rdptr; chan_request_entry_t chan_req[CHAN_REQ_BUF_SIZE]; @@ -174,13 +206,13 @@ static_assert(sizeof(chan_req_buf) == CHAN_REQ_BUF_SIZE_BYTES); -typedef struct local_pull_request { +typedef struct _local_pull_request { chan_ptr wrptr; chan_ptr rdptr; pull_request_t pull_request; } local_pull_request_t; -typedef struct chan_payload_ptr { +typedef struct _chan_payload_ptr { uint32_t ptr; uint32_t pad[2]; uint32_t ptr_cleared; @@ -188,11 +220,129 @@ static_assert(sizeof(chan_payload_ptr) == CHAN_PTR_SIZE_BYTES); +// Fabric Virtual Control Channel (FVCC) parameters. +// Each control channel message is 48 Bytes. +// There is a 16-message FVCC buffer each for incoming and outgoing messages. +// Control message capacity can be increased by increasing FVCC_BUF_SIZE. +const uint32_t FVCC_BUF_SIZE = 16; // must be 2^N +const uint32_t FVCC_BUF_LOG_SIZE = 4; // must be log2(FVCC_BUF_SIZE) +const uint32_t FVCC_SIZE_MASK = (FVCC_BUF_SIZE - 1); +const uint32_t FVCC_PTR_MASK = ((FVCC_BUF_SIZE << 1) - 1); +const uint32_t FVCC_BUF_SIZE_BYTES = PULL_REQ_SIZE_BYTES * FVCC_BUF_SIZE + 2 * CHAN_PTR_SIZE_BYTES; +const uint32_t FVCC_SYNC_BUF_SIZE_BYTES = CHAN_PTR_SIZE_BYTES * FVCC_BUF_SIZE; + +inline bool fvcc_buf_ptrs_empty(uint32_t wrptr, uint32_t rdptr) { return (wrptr == rdptr); } + +inline bool fvcc_buf_ptrs_full(uint32_t wrptr, uint32_t rdptr) { + uint32_t distance = wrptr >= rdptr ? wrptr - rdptr : wrptr + 2 * FVCC_BUF_SIZE - rdptr; + return !fvcc_buf_ptrs_empty(wrptr, rdptr) && (distance >= FVCC_BUF_SIZE); +} + +// out_req_buf has 16 byte additional storage per entry to hold the outgoing +// write pointer update. This is sent over ethernet. +// For incoming requests over ethernet, we only need storage for the request +// entry. The pointer update goes to fvcc state.
+typedef struct _ctrl_chan_msg_buf { + chan_ptr wrptr; + chan_ptr rdptr; + chan_request_entry_t msg_buf[FVCC_BUF_SIZE]; +} ctrl_chan_msg_buf; + +typedef struct _ctrl_chan_sync_buf { + chan_payload_ptr ptr[FVCC_BUF_SIZE]; +} ctrl_chan_sync_buf; + +static_assert(sizeof(ctrl_chan_msg_buf) == FVCC_BUF_SIZE_BYTES); + +typedef struct _sync_word { + uint32_t val; + uint32_t padding[3]; +} sync_word_t; + +typedef struct _gatekeeper_info { + sync_word_t router_sync; + sync_word_t ep_sync; + uint32_t routing_planes; + uint32_t padding[3]; + ctrl_chan_msg_buf gk_msg_buf; +} gatekeeper_info_t; + +#define SOCKET_DIRECTION_SEND 1 +#define SOCKET_DIRECTION_RECV 2 +#define SOCKET_TYPE_DGRAM 1 +#define SOCKET_TYPE_STREAM 2 + +enum SocketState : uint8_t { + IDLE = 0, + OPENING = 1, + ACTIVE = 2, + CLOSING = 3, +}; + +typedef struct _socket_handle { + uint16_t socket_id; + uint16_t epoch_id; + uint8_t socket_state; + uint8_t socket_type; + uint8_t socket_direction; + uint8_t rcvrs_ready; + uint32_t routing_plane; + uint16_t sender_mesh_id; + uint16_t sender_dev_id; + uint16_t rcvr_mesh_id; + uint16_t rcvr_dev_id; + uint32_t sender_handle; + uint64_t pull_notification_adddr; + uint64_t status_notification_addr; + uint32_t padding[2]; +} socket_handle_t; + +static_assert(sizeof(socket_handle_t) % 16 == 0); + +constexpr uint32_t MAX_SOCKETS = 64; +typedef struct _socket_info { + uint32_t socket_count; + uint32_t socket_setup_pending; + uint32_t padding[2]; + socket_handle_t sockets[MAX_SOCKETS]; + chan_ptr wrptr; + chan_ptr rdptr; + chan_request_entry_t gk_message; +} socket_info_t; +static_assert(sizeof(socket_info_t) % 16 == 0); + +typedef struct _fabric_client_interface { + uint64_t gk_interface_addr; + uint64_t gk_msg_buf_addr; + uint64_t pull_req_buf_addr; + uint32_t padding[2]; + uint32_t return_status[3]; + uint32_t socket_count; + chan_ptr wrptr; + chan_ptr rdptr; + chan_request_entry_t gk_message; + local_pull_request_t local_pull_request; + socket_handle_t socket_handles[MAX_SOCKETS]; +} fabric_client_interface_t; + +static_assert(sizeof(fabric_client_interface_t) % 16 == 0); + constexpr uint32_t FABRIC_ROUTER_MISC_START = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; constexpr uint32_t FABRIC_ROUTER_MISC_SIZE = 256; constexpr uint32_t FABRIC_ROUTER_SYNC_SEM = FABRIC_ROUTER_MISC_START; constexpr uint32_t FABRIC_ROUTER_SYNC_SEM_SIZE = 16; -constexpr uint32_t FABRIC_ROUTER_REQ_QUEUE_START = FABRIC_ROUTER_MISC_START + FABRIC_ROUTER_MISC_SIZE; +// Fabric Virtual Control Channel start/size +constexpr uint32_t FVCC_OUT_BUF_START = FABRIC_ROUTER_MISC_START + FABRIC_ROUTER_MISC_SIZE; +constexpr uint32_t FVCC_OUT_BUF_SIZE = FVCC_BUF_SIZE_BYTES; +constexpr uint32_t FVCC_SYNC_BUF_START = FVCC_OUT_BUF_START + FVCC_OUT_BUF_SIZE; +constexpr uint32_t FVCC_SYNC_BUF_SIZE = FVCC_SYNC_BUF_SIZE_BYTES; +constexpr uint32_t FVCC_IN_BUF_START = FVCC_SYNC_BUF_START + FVCC_SYNC_BUF_SIZE; +constexpr uint32_t FVCC_IN_BUF_SIZE = FVCC_BUF_SIZE_BYTES; + +// Fabric Virtual Channel start/size +constexpr uint32_t FABRIC_ROUTER_REQ_QUEUE_START = FVCC_IN_BUF_START + FVCC_IN_BUF_SIZE; constexpr uint32_t FABRIC_ROUTER_REQ_QUEUE_SIZE = sizeof(chan_req_buf); constexpr uint32_t FABRIC_ROUTER_DATA_BUF_START = FABRIC_ROUTER_REQ_QUEUE_START + FABRIC_ROUTER_REQ_QUEUE_SIZE; + +} // namespace tt::tt_fabric diff --git a/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp b/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp new file mode 100644 index 00000000000..b90892d5e5b --- /dev/null +++ b/tt_fabric/impl/kernels/tt_fabric_gatekeeper.cpp @@ -0,0 
+1,508 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// clang-format off +#include "dataflow_api.h" +#include "tt_fabric/hw/inc/tt_fabric.h" +#include "debug/dprint.h" +// clang-format on + +using namespace tt::tt_fabric; + +constexpr uint32_t gatekeeper_info_addr = get_compile_time_arg_val(0); +constexpr uint32_t socket_info_addr = get_compile_time_arg_val(1); +constexpr uint32_t routing_table_addr = get_compile_time_arg_val(2); +constexpr uint32_t kernel_status_buf_addr = get_compile_time_arg_val(3); +constexpr uint32_t kernel_status_buf_size_bytes = get_compile_time_arg_val(4); +constexpr uint32_t timeout_cycles = get_compile_time_arg_val(5); +uint32_t sync_val; +uint32_t router_mask; + +constexpr uint32_t PACKET_QUEUE_STAUS_MASK = 0xabc00000; +constexpr uint32_t PACKET_QUEUE_TEST_STARTED = PACKET_QUEUE_STAUS_MASK | 0x0; +constexpr uint32_t PACKET_QUEUE_TEST_PASS = PACKET_QUEUE_STAUS_MASK | 0x1; +constexpr uint32_t PACKET_QUEUE_TEST_TIMEOUT = PACKET_QUEUE_STAUS_MASK | 0xdead0; +constexpr uint32_t PACKET_QUEUE_TEST_BAD_HEADER = PACKET_QUEUE_STAUS_MASK | 0xdead1; +constexpr uint32_t PACKET_QUEUE_TEST_DATA_MISMATCH = PACKET_QUEUE_STAUS_MASK | 0x3; + +// indexes of return values in test results buffer +constexpr uint32_t PQ_TEST_STATUS_INDEX = 0; +constexpr uint32_t PQ_TEST_WORD_CNT_INDEX = 2; +constexpr uint32_t PQ_TEST_CYCLES_INDEX = 4; +constexpr uint32_t PQ_TEST_ITER_INDEX = 6; +constexpr uint32_t PQ_TEST_MISC_INDEX = 16; + +// careful, may be null +tt_l1_ptr uint32_t* const kernel_status = reinterpret_cast(kernel_status_buf_addr); +volatile tt_l1_ptr fabric_router_l1_config_t* routing_table = + reinterpret_cast(routing_table_addr); + +volatile gatekeeper_info_t* gk_info = (volatile gatekeeper_info_t*)gatekeeper_info_addr; +volatile socket_info_t* socket_info = (volatile socket_info_t*)socket_info_addr; + +uint64_t xy_local_addr; +uint64_t router_addr; +uint32_t gk_message_pending; + +inline void notify_all_routers(uint32_t notification) { + uint32_t remaining_cores = router_mask; + for (uint32_t i = 0; i < 16; i++) { + if (remaining_cores == 0) { + break; + } + if (remaining_cores & (0x1 << i)) { + uint64_t dest_addr = ((uint64_t)eth_chan_to_noc_xy[noc_index][i] << 32) | FABRIC_ROUTER_SYNC_SEM; + noc_inline_dw_write(dest_addr, notification); + remaining_cores &= ~(0x1 << i); + } + } +} + +inline void sync_all_routers() { + // wait for all device routers to have incremented the sync semaphore. + // sync_val is equal to number of tt-fabric routers running on a device. + while (gk_info->router_sync.val != sync_val); + + // send semaphore increment to all fabric routers on this device. + // semaphore notifies all other routers that this router has completed + // startup handshake with its ethernet peer. + notify_all_routers(sync_val); + gk_info->ep_sync.val = sync_val; +} + +inline void gk_msg_buf_ptr_advance(chan_ptr* ptr) { ptr->ptr = (ptr->ptr + 1) & FVCC_PTR_MASK; } + +inline void gk_msg_buf_advance_wrptr(ctrl_chan_msg_buf* msg_buf) { gk_msg_buf_ptr_advance(&(msg_buf->wrptr)); } + +inline void gk_msg_buf_advance_rdptr(ctrl_chan_msg_buf* msg_buf) { + // clear valid before incrementing read pointer. 
+ uint32_t rd_index = msg_buf->rdptr.ptr & FVCC_SIZE_MASK; + msg_buf->msg_buf[rd_index].bytes[47] = 0; + gk_msg_buf_ptr_advance(&(msg_buf->rdptr)); +} + +inline bool gk_msg_buf_ptrs_empty(uint32_t wrptr, uint32_t rdptr) { return (wrptr == rdptr); } + +inline bool gk_msg_buf_ptrs_full(uint32_t wrptr, uint32_t rdptr) { + uint32_t distance = wrptr >= rdptr ? wrptr - rdptr : wrptr + 2 * FVCC_BUF_SIZE - rdptr; + return !gk_msg_buf_ptrs_empty(wrptr, rdptr) && (distance >= FVCC_BUF_SIZE); +} + +inline bool gk_msg_buf_is_empty(const volatile ctrl_chan_msg_buf* msg_buf) { + return gk_msg_buf_ptrs_empty(msg_buf->wrptr.ptr, msg_buf->rdptr.ptr); +} + +inline bool gk_msg_buf_is_full(const volatile ctrl_chan_msg_buf* msg_buf) { + return gk_msg_buf_ptrs_full(msg_buf->wrptr.ptr, msg_buf->rdptr.ptr); +} + +inline bool gk_msg_valid(const volatile ctrl_chan_msg_buf* msg_buf) { + uint32_t rd_index = msg_buf->rdptr.ptr & FVCC_SIZE_MASK; + return msg_buf->msg_buf[rd_index].packet_header.routing.flags != 0; +} + +inline socket_handle_t* socket_receiver_available(packet_header_t* packet) { + socket_handle_t* handle = 0; + bool sender_found = false; + for (uint32_t i = 0; i < MAX_SOCKETS; i++) { + if (socket_info->sockets[i].socket_state == SocketState::OPENING && + socket_info->sockets[i].socket_direction == SOCKET_DIRECTION_SEND) { + handle = (socket_handle_t*)&socket_info->sockets[i]; + if (handle->socket_id != packet->packet_parameters.socket_parameters.socket_id) { + continue; + } + if (handle->epoch_id != packet->packet_parameters.socket_parameters.epoch_id) { + continue; + } + if (handle->sender_dev_id != routing_table[0].my_device_id) { + continue; + } + if (handle->sender_mesh_id != routing_table[0].my_mesh_id) { + continue; + } + if (handle->rcvr_dev_id != packet->routing.dst_dev_id) { + continue; + } + if (handle->rcvr_mesh_id != packet->routing.dst_mesh_id) { + continue; + } + sender_found = true; + break; + } + } + return sender_found ? handle : 0; +} +inline void set_socket_active(socket_handle_t* handle) { + handle->socket_state = SocketState::ACTIVE; + // send socket connection state to send socket opener. + noc_async_write_one_packet( + (uint32_t)(handle), handle->status_notification_addr, sizeof(socket_handle_t), noc_index); +} + +inline void socket_open(packet_header_t* packet) { + socket_handle_t* handle = 0; + if (packet->packet_parameters.socket_parameters.socket_direction == SOCKET_DIRECTION_SEND) { + handle = socket_receiver_available(packet); + if (handle) { + // If remote receive socket already opened, + // set send socket state to active and return. + handle->status_notification_addr = + ((uint64_t)packet->session.ack_offset_h << 32) | packet->session.ack_offset_l; + set_socket_active(handle); + DPRINT << "GK: Receiver Available " << (uint32_t)handle->socket_id << ENDL(); + return; + } + } + + for (uint32_t i = 0; i < MAX_SOCKETS; i++) { + if (socket_info->sockets[i].socket_state == 0) { + // idle socket handle. 
+ socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; + handle->socket_id = packet->packet_parameters.socket_parameters.socket_id; + handle->epoch_id = packet->packet_parameters.socket_parameters.epoch_id; + handle->socket_type = packet->packet_parameters.socket_parameters.socket_type; + handle->socket_direction = packet->packet_parameters.socket_parameters.socket_direction; + handle->routing_plane = packet->packet_parameters.socket_parameters.routing_plane; + handle->status_notification_addr = + ((uint64_t)packet->session.ack_offset_h << 32) | packet->session.ack_offset_l; + handle->socket_state = SocketState::OPENING; + + if (handle->socket_direction == SOCKET_DIRECTION_RECV) { + handle->pull_notification_adddr = + ((uint64_t)packet->session.target_offset_h << 32) | packet->session.target_offset_l; + handle->sender_dev_id = packet->routing.src_dev_id; + handle->sender_mesh_id = packet->routing.src_mesh_id; + handle->rcvr_dev_id = routing_table[0].my_device_id; + handle->rcvr_mesh_id = routing_table[0].my_mesh_id; + socket_info->socket_setup_pending++; + } else { + handle->sender_dev_id = routing_table[0].my_device_id; + handle->sender_mesh_id = routing_table[0].my_mesh_id; + handle->rcvr_dev_id = packet->routing.dst_dev_id; + handle->rcvr_mesh_id = packet->routing.dst_mesh_id; + } + DPRINT << "GK: Socket Opened : " << (uint32_t)handle->socket_id << ENDL(); + break; + } + } +} + +inline void socket_close(packet_header_t* packet) { + for (uint32_t i = 0; i < MAX_SOCKETS; i++) { + bool found = socket_info->sockets[i].socket_id == packet->packet_parameters.socket_parameters.socket_id && + socket_info->sockets[i].epoch_id == packet->packet_parameters.socket_parameters.epoch_id; + if (found) { + socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; + // handle->socket_id = 0; + // handle->epoch_id = 0; + // handle->socket_type = 0; + // handle->socket_direction = 0; + handle->pull_notification_adddr = 0; + handle->socket_state = SocketState::CLOSING; + socket_info->socket_setup_pending++; + break; + } + } +} + +inline void socket_open_for_connect(packet_header_t* packet) { + // open a send socket in response to a connect message received from a + // socket receiver. + // Local device has not yet opened a send socket, but we need to update + // local socket state. + for (uint32_t i = 0; i < MAX_SOCKETS; i++) { + if (socket_info->sockets[i].socket_state == 0) { + // idle socket handle. + socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; + handle->socket_id = packet->packet_parameters.socket_parameters.socket_id; + handle->epoch_id = packet->packet_parameters.socket_parameters.epoch_id; + handle->socket_type = packet->packet_parameters.socket_parameters.socket_type; + handle->socket_direction = SOCKET_DIRECTION_SEND; + handle->pull_notification_adddr = + ((uint64_t)packet->session.target_offset_h << 32) | packet->session.target_offset_l; + handle->routing_plane = packet->packet_parameters.socket_parameters.routing_plane; + handle->socket_state = SocketState::OPENING; + handle->sender_dev_id = routing_table[0].my_device_id; + handle->sender_mesh_id = routing_table[0].my_mesh_id; + handle->rcvr_dev_id = packet->routing.src_dev_id; + handle->rcvr_mesh_id = packet->routing.src_mesh_id; + DPRINT << "GK: Opened Send Socket for Remote " << (uint32_t)handle->socket_id << ENDL(); + + break; + } + } +} + +inline void socket_connect(packet_header_t* packet) { + // connect message is initiated by a receiving socket.
+ // find a matching send socket in local state. + bool found_sender = false; + for (uint32_t i = 0; i < MAX_SOCKETS; i++) { + if (socket_info->sockets[i].socket_state == SocketState::OPENING && + socket_info->sockets[i].socket_direction == SOCKET_DIRECTION_SEND) { + socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; + if (handle->socket_id != packet->packet_parameters.socket_parameters.socket_id) { + continue; + } + if (handle->epoch_id != packet->packet_parameters.socket_parameters.epoch_id) { + continue; + } + if (handle->sender_dev_id != packet->routing.dst_dev_id) { + continue; + } + if (handle->sender_mesh_id != packet->routing.dst_mesh_id) { + continue; + } + if (handle->rcvr_dev_id != packet->routing.src_dev_id) { + continue; + } + if (handle->rcvr_mesh_id != packet->routing.src_mesh_id) { + continue; + } + found_sender = true; + handle->pull_notification_adddr = + ((uint64_t)packet->session.target_offset_h << 32) | packet->session.target_offset_l; + handle->socket_state = SocketState::ACTIVE; + set_socket_active(handle); + DPRINT << "GK: Found Send Socket " << (uint32_t)handle->socket_id << ENDL(); + break; + } + } + + if (!found_sender) { + // No send socket opened yet. + // Device has not made it to the epoch where a send socket is opened. + // Log the receive socket connect request. + socket_open_for_connect(packet); + } +} + +uint32_t get_next_hop_router_noc_xy(packet_header_t* current_packet_header, uint32_t routing_plane) { + uint32_t dst_mesh_id = current_packet_header->routing.dst_mesh_id; + if (dst_mesh_id != routing_table->my_mesh_id) { + uint32_t next_port = routing_table[routing_plane].inter_mesh_table.dest_entry[dst_mesh_id]; + return eth_chan_to_noc_xy[noc_index][next_port]; + } else { + uint32_t dst_device_id = current_packet_header->routing.dst_dev_id; + uint32_t next_port = routing_table[routing_plane].intra_mesh_table.dest_entry[dst_device_id]; + return eth_chan_to_noc_xy[noc_index][next_port]; + } +} + +inline bool send_gk_message(uint64_t dest_addr, packet_header_t* packet) { + uint64_t noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, wrptr); + noc_fast_atomic_increment( + noc_index, + NCRISC_AT_CMD_BUF, + noc_addr, + NOC_UNICAST_WRITE_VC, + 1, + FVCC_BUF_LOG_SIZE, + false, + false, + (uint32_t)&socket_info->wrptr.ptr); + while (!ncrisc_noc_nonposted_atomics_flushed(noc_index)); + + uint32_t wrptr = socket_info->wrptr.ptr; + noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, rdptr); + + noc_async_read_one_packet(noc_addr, (uint32_t)(&socket_info->rdptr.ptr), 4); + noc_async_read_barrier(); + if (fvcc_buf_ptrs_full(wrptr, socket_info->rdptr.ptr)) { + return false; + } + + uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; + noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); + noc_async_write_one_packet((uint32_t)(packet), noc_addr, sizeof(packet_header_t), noc_index); + return true; +} + +inline bool retry_gk_message(uint64_t dest_addr, packet_header_t* packet) { + uint32_t wrptr = socket_info->wrptr.ptr; + uint64_t noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, rdptr); + + noc_async_read_one_packet(noc_addr, (uint32_t)(&socket_info->rdptr.ptr), 4); + noc_async_read_barrier(); + if (fvcc_buf_ptrs_full(wrptr, socket_info->rdptr.ptr)) { + return false; + } + + uint32_t dest_wr_index = wrptr & FVCC_SIZE_MASK; + noc_addr = dest_addr + offsetof(ctrl_chan_msg_buf, msg_buf) + dest_wr_index * sizeof(packet_header_t); + noc_async_write_one_packet((uint32_t)(packet), noc_addr, 
sizeof(packet_header_t), noc_index); + gk_message_pending = 0; + return true; +} + +inline void process_pending_socket() { + if (socket_info->socket_setup_pending == 0) { + // there is no pending socket setup work, + // so there is nothing to send or retry. + return; + } + + // Check write flush of previous gk message write. + // TODO. + + chan_request_entry_t* message = (chan_request_entry_t*)&socket_info->gk_message; + + if (gk_message_pending == 1) { + if (retry_gk_message(router_addr, &message->packet_header)) { + // decrement pending count. + // if there is more pending socket work, will pick up on next iteration. + socket_info->socket_setup_pending--; + } + } else { + for (uint32_t i = 0; i < MAX_SOCKETS; i++) { + if (socket_info->sockets[i].socket_state == SocketState::OPENING && + socket_info->sockets[i].socket_direction == SOCKET_DIRECTION_RECV) { + // send a message on fvcc to socket data sender. + socket_handle_t* handle = (socket_handle_t*)&socket_info->sockets[i]; + message->packet_header.routing.dst_dev_id = handle->sender_dev_id; + message->packet_header.routing.dst_mesh_id = handle->sender_mesh_id; + message->packet_header.routing.src_dev_id = handle->rcvr_dev_id; + message->packet_header.routing.src_mesh_id = handle->rcvr_mesh_id; + message->packet_header.routing.packet_size_bytes = PACKET_HEADER_SIZE_BYTES; + message->packet_header.routing.flags = SYNC; + message->packet_header.session.command = SOCKET_CONNECT; + message->packet_header.session.target_offset_h = handle->pull_notification_adddr >> 32; + message->packet_header.session.target_offset_l = (uint32_t)handle->pull_notification_adddr; + message->packet_header.packet_parameters.socket_parameters.socket_id = handle->socket_id; + message->packet_header.packet_parameters.socket_parameters.epoch_id = handle->epoch_id; + message->packet_header.packet_parameters.socket_parameters.socket_type = handle->socket_type; + message->packet_header.packet_parameters.socket_parameters.socket_direction = SOCKET_DIRECTION_RECV; + message->packet_header.packet_parameters.socket_parameters.routing_plane = handle->routing_plane; + tt_fabric_add_header_checksum((packet_header_t*)&message->packet_header); + + router_addr = + ((uint64_t)get_next_hop_router_noc_xy(&message->packet_header, handle->routing_plane) << 32) | + FVCC_OUT_BUF_START; + + if (send_gk_message(router_addr, &message->packet_header)) { + DPRINT << "GK: Sending Connect to " << (uint32_t)handle->sender_dev_id << ENDL(); + handle->socket_state = SocketState::ACTIVE; + socket_info->socket_setup_pending--; + } else { + DPRINT << "GK: Pending Connect to " << (uint32_t)handle->sender_dev_id << ENDL(); + // gatekeeper message buffer is full. need to retry on next iteration.
+ gk_message_pending = 1; + } + } + } +} + +inline void get_routing_tables() { + uint32_t temp_mask = router_mask; + uint32_t channel = 0; + uint32_t routing_plane = 0; + for (uint32_t i = 0; i < 4; i++) { + if (temp_mask & 0xF) { + temp_mask &= 0xF; + break; + } else { + temp_mask >>= 4; + } + channel += 4; + } + + if (temp_mask) { + for (uint32_t i = 0; i < 4; i++) { + if (temp_mask & 0x1) { + uint64_t router_config_addr = ((uint64_t)eth_chan_to_noc_xy[noc_index][channel] << 32) | + eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE; + noc_async_read_one_packet( + router_config_addr, + (uint32_t)&routing_table[routing_plane], + sizeof(tt::tt_fabric::fabric_router_l1_config_t)); + routing_plane++; + } + temp_mask >>= 1; + channel++; + } + } + gk_info->routing_planes = routing_plane; + noc_async_read_barrier(); +} + +void kernel_main() { + sync_val = get_arg_val(0); + router_mask = get_arg_val(1); + gk_message_pending = 0; + router_addr = 0; + + tt_fabric_init(); + + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000000); + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 1, 0xbb000000); + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 2, 0xAABBCCDD); + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX + 3, 0xDDCCBBAA); + + zero_l1_buf((tt_l1_ptr uint32_t*)&gk_info->gk_msg_buf, FVCC_BUF_SIZE_BYTES); + zero_l1_buf((tt_l1_ptr uint32_t*)socket_info, sizeof(socket_info_t)); + + sync_all_routers(); + get_routing_tables(); + uint64_t start_timestamp = get_timestamp(); + + uint32_t loop_count = 0; + uint32_t total_messages_processed = 0; + volatile ctrl_chan_msg_buf* msg_buf = &gk_info->gk_msg_buf; + while (1) { + if (!gk_msg_buf_is_empty(msg_buf) && gk_msg_valid(msg_buf)) { + uint32_t msg_index = msg_buf->rdptr.ptr & FVCC_SIZE_MASK; + chan_request_entry_t* req = (chan_request_entry_t*)msg_buf->msg_buf + msg_index; + packet_header_t* packet = &req->packet_header; + if (tt_fabric_is_header_valid(packet)) { + total_messages_processed++; + DPRINT << "GK: Message Received " << (uint32_t)packet->session.command + << " msg num = " << total_messages_processed << ENDL(); + if (packet->routing.flags == SYNC) { + if (packet->session.command == SOCKET_OPEN) { + DPRINT << "GK: Socket Open " << ENDL(); + socket_open(packet); + } else if (packet->session.command == SOCKET_CLOSE) { + socket_close(packet); + } else if (packet->session.command == SOCKET_CONNECT) { + DPRINT << "GK: Socket Connect " << ENDL(); + socket_connect(packet); + } + } + + // clear the flags field to invalidate pull request slot. + // flags will be set to non-zero by next requestor. + gk_msg_buf_advance_rdptr((ctrl_chan_msg_buf*)msg_buf); + loop_count = 0; + } else { + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_BAD_HEADER); + return; + } + } + + loop_count++; + + process_pending_socket(); + + if (gk_info->router_sync.val == 0) { + // terminate signal from host sw. + if (loop_count >= 0x1000) { + // send terminate to all chip routers.
+ notify_all_routers(0); + break; + } + } + } + + DPRINT << "Gatekeeper messages processed " << total_messages_processed << ENDL(); + + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000002); + + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000003); + + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); + + write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000005); +} diff --git a/tt_fabric/impl/kernels/tt_fabric_router.cpp b/tt_fabric/impl/kernels/tt_fabric_router.cpp index 847002383f2..efaa73cc58c 100644 --- a/tt_fabric/impl/kernels/tt_fabric_router.cpp +++ b/tt_fabric/impl/kernels/tt_fabric_router.cpp @@ -4,12 +4,16 @@ // clang-format off #include "dataflow_api.h" -#include "hw/inc/tt_fabric.h" +#include "tt_fabric/hw/inc/tt_fabric.h" // clang-format on +using namespace tt::tt_fabric; + router_state_t router_state __attribute__((aligned(16))); fvc_consumer_state_t fvc_consumer_state __attribute__((aligned(16))); // replicate for each fvc fvc_producer_state_t fvc_producer_state __attribute__((aligned(16))); // replicate for each fvc +fvcc_inbound_state_t fvcc_inbound_state __attribute__((aligned(16))); // inbound fabric virtual control channel +fvcc_outbound_state_t fvcc_outbound_state __attribute__((aligned(16))); // outbound fabric virtual control channel volatile local_pull_request_t local_pull_request_temp __attribute__((aligned(16))); // replicate for each fvc volatile local_pull_request_t* local_pull_request = &local_pull_request_temp; // replicate for each fvc @@ -17,11 +21,11 @@ constexpr uint32_t fvc_data_buf_size_words = get_compile_time_arg_val(0); constexpr uint32_t fvc_data_buf_size_bytes = fvc_data_buf_size_words * PACKET_WORD_SIZE_BYTES; constexpr uint32_t kernel_status_buf_addr_arg = get_compile_time_arg_val(1); constexpr uint32_t kernel_status_buf_size_bytes = get_compile_time_arg_val(2); -// constexpr uint32_t sync_val = get_compile_time_arg_val(3); -// constexpr uint32_t router_mask = get_compile_time_arg_val(4); constexpr uint32_t timeout_cycles = get_compile_time_arg_val(3); uint32_t sync_val; uint32_t router_mask; +uint32_t gk_message_addr_l; +uint32_t gk_message_addr_h; constexpr uint32_t PACKET_QUEUE_STAUS_MASK = 0xabc00000; constexpr uint32_t PACKET_QUEUE_TEST_STARTED = PACKET_QUEUE_STAUS_MASK | 0x0; @@ -41,37 +45,29 @@ constexpr uint32_t PQ_TEST_MISC_INDEX = 16; tt_l1_ptr uint32_t* const kernel_status = reinterpret_cast(kernel_status_buf_addr_arg); tt_l1_ptr volatile chan_req_buf* fvc_consumer_req_buf = reinterpret_cast(FABRIC_ROUTER_REQ_QUEUE_START); -tt_l1_ptr volatile tt::tt_fabric::fabric_router_l1_config_t* routing_table = - reinterpret_cast( - eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); +volatile tt_l1_ptr fabric_router_l1_config_t* routing_table = + reinterpret_cast(eth_l1_mem::address_map::FABRIC_ROUTER_CONFIG_BASE); uint64_t xy_local_addr; #define SWITCH_THRESHOLD 0x3FF -inline void notify_all_routers() { - uint32_t channel = 0; - uint32_t remaining_cores = router_mask; - // send semaphore increment to all fabric routers on this device. +inline void notify_gatekeeper() { + // send semaphore increment to gatekeeper on this device. // semaphore notifies all other routers that this router has completed // startup handshake with its ethernet peer.
- for (uint32_t i = 0; i < 16; i++) { - if (remaining_cores == 0) { - break; - } - if (remaining_cores & (0x1 << i)) { - uint64_t dest_addr = ((uint64_t)eth_chan_to_noc_xy[noc_index][i] << 32) | FABRIC_ROUTER_SYNC_SEM; - noc_fast_atomic_increment( - noc_index, - NCRISC_AT_CMD_BUF, - dest_addr, - NOC_UNICAST_WRITE_VC, - 1, - 31, - false, - false, - MEM_NOC_ATOMIC_RET_VAL_ADDR); - } - } + uint64_t dest_addr = + (((uint64_t)gk_message_addr_h << 32) | gk_message_addr_l) + offsetof(gatekeeper_info_t, router_sync); + noc_fast_atomic_increment( + noc_index, + NCRISC_AT_CMD_BUF, + dest_addr, + NOC_UNICAST_WRITE_VC, + 1, + 31, + false, + false, + MEM_NOC_ATOMIC_RET_VAL_ADDR); + volatile uint32_t* sync_sem_addr = (volatile uint32_t*)FABRIC_ROUTER_SYNC_SEM; // wait for all device routers to have incremented the sync semaphore. // sync_val is equal to number of tt-fabric routers running on a device. @@ -84,8 +80,11 @@ inline void notify_all_routers() { void kernel_main() { rtos_context_switch_ptr = (void (*)())RtosTable[0]; - sync_val = get_arg_val(0); - router_mask = get_arg_val(1); + uint32_t rt_args_idx = 0; + sync_val = get_arg_val(rt_args_idx++); + router_mask = get_arg_val(rt_args_idx++); + gk_message_addr_l = get_arg_val(rt_args_idx++); + gk_message_addr_h = get_arg_val(rt_args_idx++); tt_fabric_init(); @@ -99,6 +98,8 @@ void kernel_main() { router_state.sync_out = 0; zero_l1_buf((tt_l1_ptr uint32_t*)fvc_consumer_req_buf, sizeof(chan_req_buf)); + zero_l1_buf((tt_l1_ptr uint32_t*)FVCC_IN_BUF_START, FVCC_IN_BUF_SIZE); + zero_l1_buf((tt_l1_ptr uint32_t*)FVCC_OUT_BUF_START, FVCC_OUT_BUF_SIZE); write_kernel_status(kernel_status, PQ_TEST_WORD_CNT_INDEX, (uint32_t)&router_state); write_kernel_status(kernel_status, PQ_TEST_WORD_CNT_INDEX + 1, (uint32_t)&fvc_consumer_state); write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX + 1, (uint32_t)&fvc_producer_state); @@ -110,12 +111,19 @@ void kernel_main() { fvc_data_buf_size_words / 2, (uint32_t)&fvc_consumer_state.remote_rdptr); + fvcc_outbound_state.init( + FVCC_OUT_BUF_START, FVCC_SYNC_BUF_START, FVCC_IN_BUF_START, (uint32_t)&fvcc_inbound_state.inbound_wrptr); + fvcc_inbound_state.init( + FVCC_IN_BUF_START, + (uint32_t)&fvcc_outbound_state.remote_rdptr, + (((uint64_t)gk_message_addr_h << 32) | gk_message_addr_l) + offsetof(gatekeeper_info_t, gk_msg_buf)); + if (!wait_all_src_dest_ready(&router_state, timeout_cycles)) { write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); return; } - notify_all_routers(); + notify_gatekeeper(); uint64_t start_timestamp = get_timestamp(); write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000001); @@ -195,6 +203,9 @@ void kernel_main() { return; } + fvcc_inbound_state.fvcc_handler(); + fvcc_outbound_state.fvcc_handler(); + loop_count++; // need to optimize this. 
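The startup sequence that replaces notify_all_routers() with notify_gatekeeper() is effectively a two-phase barrier: each router atomically increments gk_info->router_sync, the gatekeeper's sync_all_routers() spins until the count reaches sync_val (the number of fabric routers on the device), then writes sync_val back to every router's FABRIC_ROUTER_SYNC_SEM and raises ep_sync for endpoint kernels polling in fabric_endpoint_init(). Below is a host-side analogue of the same structure, with std::atomic standing in for the L1 semaphores; all names are illustrative and the router count is fixed at four for the sketch.

#include <atomic>
#include <cstdint>
#include <thread>
#include <vector>

constexpr uint32_t kNumRouters = 4;               // stands in for sync_val

std::atomic<uint32_t> router_sync{0};             // stands in for gk_info->router_sync.val
std::atomic<uint32_t> ep_sync{0};                 // stands in for gk_info->ep_sync.val
std::atomic<uint32_t> router_sem[kNumRouters]{};  // per-router FABRIC_ROUTER_SYNC_SEM

void router(uint32_t idx) {
    router_sync.fetch_add(1);                         // notify_gatekeeper(): one atomic increment
    while (router_sem[idx].load() != kNumRouters) {}  // spin on the local sync semaphore
}

void gatekeeper() {
    while (router_sync.load() != kNumRouters) {}  // sync_all_routers(): wait for every router
    for (auto& sem : router_sem) {                // notify_all_routers(sync_val)
        sem.store(kNumRouters);
    }
    ep_sync.store(kNumRouters);                   // releases fabric_endpoint_init() pollers
}

int main() {
    std::vector<std::thread> threads;
    threads.emplace_back(gatekeeper);
    for (uint32_t i = 0; i < kNumRouters; i++) {
        threads.emplace_back(router, i);
    }
    for (auto& t : threads) {
        t.join();
    }
    return ep_sync.load() == kNumRouters ? 0 : 1;
}

Termination runs the same path in reverse: the host clears router_sync to zero, and the gatekeeper loop above reacts by broadcasting notify_all_routers(0) to shut the chip's routers down.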
diff --git a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h index 3ce95f1beba..d345637891c 100644 --- a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h +++ b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h @@ -33,7 +33,7 @@ struct address_map { static constexpr std::int32_t MAX_L1_LOADING_SIZE = 1; static constexpr std::int32_t FABRIC_ROUTER_CONFIG_BASE = 0; - static constexpr std::int32_t FABRIC_ROUTER_CONFIG_SIZE = 2056; + static constexpr std::int32_t FABRIC_ROUTER_CONFIG_SIZE = 2064; static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = 0; static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = 0; diff --git a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h index faa1814d985..e28c477a8a2 100644 --- a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h @@ -63,7 +63,7 @@ struct address_map { static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_BASE = ERISC_MEM_MAILBOX_END; static constexpr std::int32_t FABRIC_ROUTER_CONFIG_BASE = (ERISC_L1_KERNEL_CONFIG_BASE + ERISC_L1_KERNEL_CONFIG_SIZE + 31) & ~31; - static constexpr std::int32_t FABRIC_ROUTER_CONFIG_SIZE = 2056; + static constexpr std::int32_t FABRIC_ROUTER_CONFIG_SIZE = 2064; static constexpr std::int32_t ERISC_L1_UNRESERVED_BASE = (FABRIC_ROUTER_CONFIG_BASE + FABRIC_ROUTER_CONFIG_SIZE + 31) & ~31; static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = MAX_L1_LOADING_SIZE - ERISC_L1_UNRESERVED_BASE;
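Two details of the address-map change are easy to miss. FABRIC_ROUTER_CONFIG_SIZE grows from 2056 to 2064 bytes, presumably tracking a grown fabric_router_l1_config_t (the new size is also a multiple of 16, which 2056 is not). And the three FVCC regions carved out after the misc region push FABRIC_ROUTER_REQ_QUEUE_START a further 2 * 800 + 256 = 1856 bytes into L1. A minimal constexpr sketch re-deriving that layout from the constants in tt_fabric_interface.h; the base address here is a placeholder, not the real ERISC_L1_UNRESERVED_BASE:

#include <cstdint>

constexpr uint32_t PACKET_WORD_SIZE_BYTES = 16;
constexpr uint32_t CHAN_PTR_SIZE_BYTES = 16;
constexpr uint32_t PULL_REQ_SIZE_BYTES = 48;
constexpr uint32_t FVCC_BUF_SIZE = 16;
constexpr uint32_t FVCC_BUF_SIZE_BYTES = PULL_REQ_SIZE_BYTES * FVCC_BUF_SIZE + 2 * CHAN_PTR_SIZE_BYTES;  // 800
constexpr uint32_t FVCC_SYNC_BUF_SIZE_BYTES = CHAN_PTR_SIZE_BYTES * FVCC_BUF_SIZE;                       // 256

constexpr uint32_t MISC_START = 0x10000;  // placeholder base address
constexpr uint32_t MISC_SIZE = 256;
constexpr uint32_t FVCC_OUT_BUF_START = MISC_START + MISC_SIZE;
constexpr uint32_t FVCC_SYNC_BUF_START = FVCC_OUT_BUF_START + FVCC_BUF_SIZE_BYTES;
constexpr uint32_t FVCC_IN_BUF_START = FVCC_SYNC_BUF_START + FVCC_SYNC_BUF_SIZE_BYTES;
constexpr uint32_t REQ_QUEUE_START = FVCC_IN_BUF_START + FVCC_BUF_SIZE_BYTES;

// The FVCC carve-out totals 800 + 256 + 800 = 1856 bytes, so the request
// queue moves from MISC_START + 256 to MISC_START + 2112, still 16-byte aligned.
static_assert(REQ_QUEUE_START - (MISC_START + MISC_SIZE) == 1856);
static_assert(REQ_QUEUE_START % PACKET_WORD_SIZE_BYTES == 0);

int main() { return 0; }

If any of the buffer constants drift, the static_asserts fail at compile time, which is cheaper to catch than two L1 regions silently overlapping.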