Merge branch 'main' into dimitri/17040-single-release-image
dimitri-tenstorrent authored Jan 23, 2025
2 parents 5f6cbfc + fb9f208 commit 0de3350
Showing 25 changed files with 1,414 additions and 157 deletions.
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -8,7 +8,6 @@ endif()
############################################
# Project setup
############################################
include(cmake/compilers.cmake)

list(PREPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
include(version)
@@ -79,6 +78,7 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
endif()
endif()

include(compilers)
CHECK_COMPILERS()

if(NOT isMultiConfig AND NOT CMAKE_BUILD_TYPE IN_LIST CMAKE_CONFIGURATION_TYPES)
@@ -91,6 +91,9 @@ set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -DDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DDEBUG")
set(CMAKE_CXX_FLAGS_CI "-O3 -DDEBUG")

# We're not currently using C++20 modules, so don't bother scanning for them
set(CMAKE_CXX_SCAN_FOR_MODULES FALSE)

############################################################################################################################
# Project Options
# The following options and their defaults impact what artifacts get built
2 changes: 1 addition & 1 deletion models/demos/bert_tiny/tests/test_performance.py
@@ -119,7 +119,7 @@ def test_perf_device_bare_metal(batch_size, expected_perf):
margin = 0.03

if is_wormhole_b0():
expected_perf = 3990.0
expected_perf = 4114.8
else:
expected_perf = 3460.0

@@ -1116,3 +1116,32 @@ def test_transpose_16411(device):
assert_with_pcc(p_c2, ttnn.to_torch(c2), 0.9999)
assert_with_pcc(p_c3, ttnn.to_torch(c3), 0.9999)
assert_with_pcc(p_c4, ttnn.to_torch(c4), 0.9999)


@pytest.mark.parametrize("rank", [5])
@pytest.mark.parametrize("indices", [[0, 1], [0, 2], [0, 3], [0, 4], [1, 2], [1, 3], [1, 4], [2, 3], [2, 4], [3, 4]])
@pytest.mark.parametrize("layout", [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT])
def test_transpose_high_rank(*, device: ttnn.Device, rank: int, indices, layout):
torch.manual_seed(2005)
ttnn.disable_and_clear_program_cache(device)
ttnn.enable_program_cache(device)

shape = [2] * rank

a = torch.randn(shape, dtype=torch.bfloat16)
b = torch.randn(shape, dtype=torch.bfloat16)

tt_a = ttnn.from_torch(a, device=device, layout=layout)
tt_b = ttnn.from_torch(b, device=device, layout=layout)

a = a.transpose(*indices)
b = b.transpose(*indices)

tt_a = ttnn.transpose(tt_a, *indices)
tt_b = ttnn.transpose(tt_b, *indices)

output_a = ttnn.to_torch(tt_a)
output_b = ttnn.to_torch(tt_b)

assert torch.allclose(a, output_a)
assert torch.allclose(b, output_b)
124 changes: 117 additions & 7 deletions tests/ttnn/unit_tests/operations/test_permute.py
@@ -5,6 +5,7 @@
import pytest

import torch
import math

import ttnn
import itertools
@@ -182,13 +183,13 @@ def generate_permutations(N):
yield perm


@skip_for_blackhole("tilize_block gives bad pcc after second iteration")
@skip_for_grayskull("tilize_block gives bad pcc after second iteration")
@pytest.mark.parametrize("shape", [(7, 7, 7, 7, 7)])
@pytest.mark.parametrize("perm", generate_permutations(5))
@pytest.mark.parametrize("memory_config", [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.float32])
def test_permute_5d_width(shape, perm, memory_config, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
input_a = torch.randn(shape)
torch_output = torch.permute(input_a, perm)
@@ -202,13 +203,13 @@ def test_permute_5d_width(shape, perm, memory_config, dtype, device):
assert_with_pcc(torch_output, tt_output, 0.9999)


@skip_for_blackhole("tilize_block gives bad pcc after second iteration")
@skip_for_grayskull("tilize_block gives bad pcc after second iteration")
@pytest.mark.parametrize("shape", [(3, 65, 3, 3, 65), (1, 6, 256, 20, 50), (6, 20, 50, 1, 256)])
@pytest.mark.parametrize("perm", [(4, 0, 3, 2, 1), (1, 3, 4, 0, 2), (3, 0, 4, 1, 2)])
@pytest.mark.parametrize("memory_config", [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.float32])
def test_permute_5d_blocked(shape, perm, memory_config, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(520)
input_a = torch.randn(shape)

@@ -224,8 +225,6 @@ def test_permute_5d_blocked(shape, perm, memory_config, dtype, device):
assert_with_pcc(torch_output, tt_output, 0.9999)


@skip_for_blackhole("tilize_block gives bad pcc after second iteration")
@skip_for_grayskull("tilize_block gives bad pcc after second iteration")
def test_permute_nd(device):
torch.manual_seed(2005)
torch_tensor = torch.rand((1, 3, 16, 16, 16, 16), dtype=torch.bfloat16)
@@ -400,7 +399,6 @@ def test_permutations_5d_fixed_w(shape, perm, dtype, device):
assert_with_pcc(torch_output, output_tensor, 0.9999)


@pytest.mark.skip("#16575 to_layout from tiled to RM fails on reshape")
@pytest.mark.parametrize("shape", [[1, 9, 91, 7, 9]])
@pytest.mark.parametrize("perm", [[0, 3, 4, 1, 2]])
def test_permute_adversarial(shape, perm, device):
@@ -427,3 +425,115 @@ def test_permute_4d_fixed_w(shape, perm, device):
torch_output = torch.permute(torch_tensor, perm)
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)


def generate_fixed_no_dim0_dim1_transpose_permutations(N, dim0, dim1):
perms_Nd = generate_permutations(N)
for perm in perms_Nd:
if perm[dim0] != dim1:
yield perm


@pytest.mark.parametrize("shape", [[7, 7, 7, 17, 17]])
@pytest.mark.parametrize("perm", [[0, 1, 4, 3, 2]])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.float32])
@pytest.mark.parametrize("pad_value", [35.0, float("-inf"), None])
def test_permute_5d_yw_padded(shape, perm, dtype, pad_value, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
torch_tensor = torch.rand(shape, dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_tensor, layout=ttnn.TILE_LAYOUT, dtype=dtype, device=device)
ttnn_output = ttnn.permute(input_tensor, perm, pad_value=pad_value)
output_tensor = ttnn.to_torch(ttnn_output)
torch_output = torch.permute(torch_tensor, perm)

assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)

if pad_value is not None:
logical_shape = torch_output.shape
output_padded = ttnn.from_device(ttnn_output).to_torch()
padded_shape = output_padded.shape
num_padded_values = torch.prod(torch.tensor(padded_shape)) - torch.prod(torch.tensor(logical_shape))
assert torch.sum(output_padded == pad_value) == num_padded_values


@pytest.mark.parametrize("shape", [[33, 1, 17, 33, 33]])
@pytest.mark.parametrize("perm", generate_fixed_no_dim0_dim1_transpose_permutations(5, 4, 3))
@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.float32])
def test_permute_5d_yw_permutations(shape, perm, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
torch_tensor = torch.rand(shape, dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_tensor, layout=ttnn.TILE_LAYOUT, dtype=dtype, device=device)
output_tensor = ttnn.permute(input_tensor, perm)
output_tensor = ttnn.to_torch(output_tensor)
torch_output = torch.permute(torch_tensor, perm)
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)


@pytest.mark.parametrize("shape", [[1, 1, 32, 32], [1, 1, 128, 128], [32, 32, 32, 32], [96, 96, 96, 96]])
@pytest.mark.parametrize("perm", [[0, 3, 2, 1], [3, 1, 2, 0], [1, 3, 2, 0], [3, 0, 2, 1]])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16])
def test_permute_4d_yw_permutations(shape, perm, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
torch_tensor = torch.rand(shape, dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_tensor, layout=ttnn.TILE_LAYOUT, dtype=dtype, device=device)
output_tensor = ttnn.permute(input_tensor, perm)
output_tensor = ttnn.to_torch(output_tensor)
torch_output = torch.permute(torch_tensor, perm)
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)


@pytest.mark.parametrize("shape", [[1, 1, 32, 32], [1, 1, 128, 128], [32, 32, 32, 32], [96, 96, 96, 96]])
@pytest.mark.parametrize("perm", [[2, 3, 0, 1], [3, 2, 1, 0], [2, 3, 1, 0], [3, 2, 0, 1]])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16])
def test_permute_4d_whyx_permutations(shape, perm, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
torch_tensor = torch.rand(shape, dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_tensor, layout=ttnn.TILE_LAYOUT, dtype=dtype, device=device)
output_tensor = ttnn.permute(input_tensor, perm)
output_tensor = ttnn.to_torch(output_tensor)
torch_output = torch.permute(torch_tensor, perm)
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)


@pytest.mark.parametrize("shape", [[1, 1, 32, 32], [1, 1, 128, 128], [32, 32, 32, 32], [96, 96, 96, 96]])
@pytest.mark.parametrize("perm", [[0, 2, 3, 1], [0, 3, 1, 2], [1, 2, 3, 0], [2, 1, 3, 0], [2, 0, 3, 1]])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16])
def test_permute_4d_other_permutations(shape, perm, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
torch_tensor = torch.rand(shape, dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_tensor, layout=ttnn.TILE_LAYOUT, dtype=dtype, device=device)
output_tensor = ttnn.permute(input_tensor, perm)
output_tensor = ttnn.to_torch(output_tensor)
torch_output = torch.permute(torch_tensor, perm)
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)


@pytest.mark.parametrize("shape", [[33, 1, 17, 33, 33]])
@pytest.mark.parametrize("perm", [[0, 1, 4, 2, 3], [0, 4, 1, 2, 3], [2, 4, 1, 0, 3], [4, 2, 1, 0, 3]])
@pytest.mark.parametrize("dtype", [ttnn.bfloat16, ttnn.float32])
def test_permute_5d_wyh(shape, perm, dtype, device):
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Grayskull doesn't support float32")
torch.manual_seed(2005)
torch_tensor = torch.rand(shape, dtype=torch.bfloat16)
input_tensor = ttnn.from_torch(torch_tensor, layout=ttnn.TILE_LAYOUT, dtype=dtype, device=device)
output_tensor = ttnn.permute(input_tensor, perm, pad_value=0.0)
output_tensor = ttnn.to_torch(output_tensor)
torch_output = torch.permute(torch_tensor, perm)
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)
2 changes: 1 addition & 1 deletion tt-train/cmake/dependencies.cmake
@@ -58,7 +58,7 @@ CPMAddPackage(NAME magic_enum GITHUB_REPOSITORY Neargye/magic_enum GIT_TAG v0.9.
# nlohmann/json : https://github.com/nlohmann/json
############################################################################################################################

CPMAddPackage(NAME json GITHUB_REPOSITORY nlohmann/json GIT_TAG v3.9.1 OPTIONS "JSON_BuildTests OFF")
CPMAddPackage(NAME json GITHUB_REPOSITORY nlohmann/json GIT_TAG v3.11.3 OPTIONS "JSON_BuildTests OFF")

CPMAddPackage(NAME xtl GITHUB_REPOSITORY xtensor-stack/xtl GIT_TAG 0.7.7 OPTIONS "XTL_ENABLE_TESTS OFF")

2 changes: 1 addition & 1 deletion tt-train/sources/examples/linear_regression_ddp/main.cpp
@@ -86,7 +86,7 @@ int main() {
optimizer.zero_grad();
auto output = (*model)(data);
auto loss = ttml::ops::mse_loss(output, targets);
fmt::print("Loss shape: {}\n", loss->get_value().shape());
fmt::print("Loss shape: {}\n", loss->get_value().get_shape());
auto mesh_shape = device->shape();
ttml::core::MeshToXTensorVariant<float> identity_composer =
ttml::core::VectorMeshToXTensor<float>(mesh_shape);
2 changes: 1 addition & 1 deletion tt-train/sources/examples/sample_app/main.cpp
@@ -14,7 +14,7 @@ void print_tensor(const tt::tt_metal::Tensor& tensor) {
// but we are using TILE layout. The printed format WILL NOT be correct. But good enough for a demo

// Get the shape of the tensor
auto shape = tensor.shape();
auto shape = tensor.get_shape();
// compute the size of the tensor
size_t size = 1;
for (size_t i = 0; i < shape.size(); i++) size *= shape[i];
10 changes: 10 additions & 0 deletions tt-train/sources/ttml/autograd/tensor.cpp
@@ -109,6 +109,16 @@ void Tensor::set_value(const tt::tt_metal::Tensor& value) {
}

void Tensor::set_grad(const tt::tt_metal::Tensor& grad) {
if (core::is_tensor_initialized(grad)) {
auto grad_shape = grad.get_shape();
auto value_shape = m_value.get_tensor().get_shape();
if (grad_shape != value_shape) {
throw std::logic_error(fmt::format(
"Shapes of gradients are not equal. Expected: {}, got: {}",
m_value.get_tensor().get_shape(),
grad.get_shape()));
}
}
m_grad = grad;
}

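The set_grad change above adds a guard so an initialized gradient is only accepted when its shape matches the tensor's value. As a rough standalone sketch of that pattern, using hypothetical stand-in types rather than the real ttml/tt_metal classes:

// Standalone sketch of the gradient shape guard added to Tensor::set_grad above.
// SimpleTensor and SimpleAutogradTensor are hypothetical stand-ins, not the real ttml types.
#include <stdexcept>
#include <vector>

struct SimpleTensor {
    std::vector<int> shape;    // logical shape
    bool initialized = false;  // plays the role of core::is_tensor_initialized(...)
};

struct SimpleAutogradTensor {
    SimpleTensor value;
    SimpleTensor grad;

    void set_grad(const SimpleTensor& g) {
        // Only validate a gradient that actually holds data; an empty gradient is stored as-is.
        if (g.initialized && g.shape != value.shape) {
            throw std::logic_error("gradient shape does not match value shape");
        }
        grad = g;
    }
};

int main() {
    SimpleAutogradTensor t;
    t.value = {{2, 3}, true};
    t.set_grad({{2, 3}, true});    // accepted: shapes match
    // t.set_grad({{3, 2}, true}); // would throw std::logic_error
    return 0;
}

With this guard, a mismatched gradient fails fast at the point of assignment rather than surfacing later as a silent numerical error.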
4 changes: 2 additions & 2 deletions tt-train/sources/ttml/models/gpt2.cpp
@@ -65,9 +65,9 @@ void weights_initialization(Transformer& model) {
for (auto& [name, tensor_ptr] : params) {
const auto& tensor = tensor_ptr->get_value();
if (name.find("weight") != std::string::npos) {
init::normal_init(tensor_ptr, tensor.shape(), {0.F, 0.02F});
init::normal_init(tensor_ptr, tensor.get_shape(), {0.F, 0.02F});
} else if (name.find("bias") != std::string::npos) {
init::constant_init(tensor_ptr, tensor.shape(), 0.F);
init::constant_init(tensor_ptr, tensor.get_shape(), 0.F);
}
}
}
4 changes: 2 additions & 2 deletions tt-train/sources/ttml/ops/layernorm_op.cpp
@@ -81,7 +81,7 @@ autograd::TensorPtr composite_layernorm(
auto tensor_shape = tensor->get_value().get_shape();

auto shape = core::create_shape({tensor_shape[0], tensor_shape[1], tensor_shape[2], 1});
auto mean = core::zeros(shape, &autograd::ctx().get_device(), tensor->get_value().dtype());
auto mean = core::zeros(shape, &autograd::ctx().get_device(), tensor->get_value().get_dtype());
ttnn::moreh_mean(
tensor->get_value(),
3, // last dimension
@@ -139,7 +139,7 @@ autograd::TensorPtr composite_layernorm(
// dnorm.mean(-1, keepdim=True)
auto dnorm_shape = dtensor_normalized.get_shape();
auto shape = core::create_shape({dnorm_shape[0], dnorm_shape[1], dnorm_shape[2], 1});
auto dnorm_mean = core::zeros(shape, &autograd::ctx().get_device(), tensor->get_value().dtype());
auto dnorm_mean = core::zeros(shape, &autograd::ctx().get_device(), tensor->get_value().get_dtype());
ttnn::moreh_mean(
dtensor_normalized,
/* dim */ 3,
4 changes: 2 additions & 2 deletions tt-train/sources/ttml/ops/losses.cpp
@@ -70,7 +70,7 @@ autograd::TensorPtr nll_loss(
auto* device = &autograd::ctx().get_device();
auto divisor = core::empty(ttnn::Shape({1, 1}, {32, 32}), device, prediction->get_value().memory_config());

auto tensor_shape = prediction->get_value().shape();
auto tensor_shape = prediction->get_value().get_shape();
uint32_t Ndim = tensor_shape[0] * tensor_shape[1] * tensor_shape[2];
uint32_t Cdim = tensor_shape[3];
auto reshaped_tensor = ttnn::reshape(prediction->get_value(), ttnn::Shape({Ndim, Cdim}));
@@ -98,7 +98,7 @@ autograd::TensorPtr nll_loss(
/* ignore_index */ -100,
/* memory_config */ std::nullopt,
/* compute_kernel_config */ core::ComputeKernelConfig::precise());
grad = ttnn::reshape(grad, prediction->get_value().shape());
grad = ttnn::reshape(grad, prediction->get_value().get_shape());
prediction->add_grad(grad);
};
auto links = autograd::get_links(prediction);
2 changes: 1 addition & 1 deletion tt-train/sources/ttml/ops/unary_ops.cpp
@@ -122,7 +122,7 @@ autograd::TensorPtr mean(const autograd::TensorPtr& tensor) {
}

autograd::TensorPtr broadcast_batch(const autograd::TensorPtr& tensor, uint32_t new_batch_dim) {
if (new_batch_dim == 1 || tensor->get_value().shape()[0] == new_batch_dim) {
if (new_batch_dim == 1 || tensor->get_value().get_shape()[0] == new_batch_dim) {
return tensor;
}
auto out = ttml::autograd::create_tensor();
23 changes: 11 additions & 12 deletions tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
@@ -397,17 +397,24 @@ void process_exec_buf_end_d(uint32_t& block_noc_writes_to_clear, uint32_t block_

// Note that for non-paged writes, the number of writes per page is always 1
// This means each noc_write frees up a page
template <bool multicast>
void process_write_linear(
uint32_t num_mcast_dests, uint32_t& block_noc_writes_to_clear, uint32_t block_next_start_addr[]) {
volatile tt_l1_ptr CQDispatchCmd* cmd = (volatile tt_l1_ptr CQDispatchCmd*)cmd_ptr;
bool multicast = num_mcast_dests > 0;
if (not multicast) {
num_mcast_dests = 1;
}

uint32_t dst_noc = cmd->write_linear.noc_xy_addr;
uint32_t write_offset_index = cmd->write_linear.write_offset_index;
uint32_t dst_addr = cmd->write_linear.addr + write_offset[write_offset_index];
uint32_t length = cmd->write_linear.length;
uint32_t data_ptr = cmd_ptr + sizeof(CQDispatchCmd);
cq_noc_async_write_init_state<CQ_NOC_sNdl, multicast>(0, get_noc_addr_helper(dst_noc, dst_addr));
if (multicast) {
cq_noc_async_write_init_state<CQ_NOC_sNdl, true>(0, get_noc_addr_helper(dst_noc, dst_addr));
} else {
cq_noc_async_write_init_state<CQ_NOC_sNdl, false>(0, get_noc_addr_helper(dst_noc, dst_addr));
}

while (length != 0) {
// More data needs to be written, but we've exhausted the CB. Acquire more pages.
@@ -434,11 +441,7 @@
uint32_t available_data = cb_fence - data_ptr;
uint32_t xfer_size = length > available_data ? available_data : length;

if constexpr (multicast) {
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_mcast_dests);
} else {
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size);
}
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_mcast_dests);
// Increment counters based on the number of packets that were written
uint32_t num_noc_packets_written = div_up(xfer_size, NOC_MAX_BURST_SIZE);
noc_nonposted_writes_num_issued[noc_index] += num_noc_packets_written;
@@ -454,11 +457,7 @@ void process_write(uint32_t& block_noc_writes_to_clear, uint32_t block_next_start_addr[]) {
void process_write(uint32_t& block_noc_writes_to_clear, uint32_t block_next_start_addr[]) {
volatile tt_l1_ptr CQDispatchCmd* cmd = (volatile tt_l1_ptr CQDispatchCmd*)cmd_ptr;
uint32_t num_mcast_dests = cmd->write_linear.num_mcast_dests;
if (num_mcast_dests == 0) {
process_write_linear<false>(1, block_noc_writes_to_clear, block_next_start_addr);
} else {
process_write_linear<true>(num_mcast_dests, block_noc_writes_to_clear, block_next_start_addr);
}
process_write_linear(num_mcast_dests, block_noc_writes_to_clear, block_next_start_addr);
}

template <bool is_dram>
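In the cq_dispatch change above, process_write_linear drops its template <bool multicast> parameter: the multicast decision is now derived at runtime from num_mcast_dests, only the init-state call still branches to pick a compile-time constant, and a single call path handles the write itself. A minimal standalone sketch of the same refactor, with illustrative names in place of the real dispatch-kernel helpers:

// Sketch: replacing a compile-time bool template parameter with a runtime flag,
// while still branching to templated helpers where a constant is required.
// Function names here are illustrative stand-ins, not the real kernel API.
#include <cstdint>
#include <cstdio>

template <bool multicast>
void init_write_state(uint64_t dst) {
    // Stand-in for the templated init-state call, which needs a compile-time flag.
    std::printf("init state (multicast=%d) dst=0x%llx\n", multicast ? 1 : 0, (unsigned long long)dst);
}

void write_any_len(uint32_t src, uint64_t dst, uint32_t len, uint32_t num_dests) {
    // Stand-in for the any-length write; num_dests is 1 for unicast.
    std::printf("write %u bytes from 0x%x to 0x%llx for %u dest(s)\n", len, src, (unsigned long long)dst, num_dests);
}

void process_write_linear(uint32_t num_mcast_dests, uint32_t src, uint64_t dst, uint32_t len) {
    bool multicast = num_mcast_dests > 0;  // decision moved from template argument to runtime
    if (!multicast) {
        num_mcast_dests = 1;               // unicast behaves like a single destination
    }
    if (multicast) {
        init_write_state<true>(dst);       // templated helper still needs a constant
    } else {
        init_write_state<false>(dst);
    }
    write_any_len(src, dst, len, num_mcast_dests);  // one call path for both cases
}

int main() {
    process_write_linear(0, 0x1000, 0x2000, 64);  // unicast
    process_write_linear(4, 0x1000, 0x2000, 64);  // multicast to 4 destinations
    return 0;
}

The likely effect is a single compiled copy of process_write_linear rather than two template instantiations, at the cost of one runtime branch per command around the init call.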