Draft IRON implementation for init values

Xilinx · Oct 31, 2024 · acf2e52 · acf2e52
1 parent d4db482
commit acf2e52
Show file tree

Hide file tree

Showing 4 changed files with 285 additions and 0 deletions.
diff --git a/python/dialects/aie.py b/python/dialects/aie.py
@@ -379,6 +379,7 @@ def __init__(
         via_DMA=None,
         plio=None,
         disable_synchronization=None,
+        initValues=None,
     ):
         self.datatype = try_convert_np_type_to_mlir_type(datatype)
         if not isinstance(consumerTiles, List):
@@ -387,6 +388,8 @@ def __init__(
             dimensionsFromStreamPerConsumer = []
         if dimensionsToStream is None:
             dimensionsToStream = []
+        if initValues is None:
+            initValues = []
         of_Ty = TypeAttr.get(ObjectFifoType.get(self.datatype))
         super().__init__(
             sym_name=name,
@@ -399,6 +402,7 @@ def __init__(
             via_DMA=via_DMA,
             plio=plio,
             disable_synchronization=disable_synchronization,
+            initValues=initValues,
         )
 
     def acquire(self, port, num_elem):

diff --git a/test/npu-xrt/memtile_repeat/init_values_repeat/aie2.py b/test/npu-xrt/memtile_repeat/init_values_repeat/aie2.py
@@ -0,0 +1,77 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+# REQUIRES: ryzen_ai, valid_xchess_license
+#
+# RUN: %python %S/aie2.py 4 > ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+# RUN: %run_on_npu ./test.exe -x final.xclbin -i insts.txt -k MLIR_AIE -l 4 | FileCheck %s
+# CHECK: PASS!
+import numpy as np
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.extras.context import mlir_mod_ctx
+
+N = 2
+depth = 2
+dev = AIEDevice.npu1_1col
+col = 0
+#memtile_repeat_count = 2
+
+if len(sys.argv) > 1:
+    N = int(sys.argv[1])
+data_out_size = N * depth #* (memtile_repeat_count + 1)
+
+if len(sys.argv) > 2:
+    if sys.argv[2] == "npu":
+        dev = AIEDevice.npu1_1col
+    elif sys.argv[2] == "xcvc1902":
+        dev = AIEDevice.xcvc1902
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[2]))
+
+if len(sys.argv) > 3:
+    col = int(sys.argv[3])
+
+
+def init_repeat():
+    """Build the test design inside a fresh MLIR module and print it to stdout.
+
+    The design declares three tiles in column `col` (rows 0, 1 and 2), creates
+    the object FIFOs "in" and "out" (each of depth 2, carrying `N` int32
+    elements), links them, and defines a runtime sequence that issues a single
+    DMA transfer of `data_out_size` int32 elements into its third argument.
+
+    NOTE(review): no `initValues` argument is passed to either object FIFO
+    here, although the test directory is named `init_values_repeat` -- confirm
+    whether the draft still needs to wire the initial values in.
+    """
+    with mlir_mod_ctx() as ctx:
+
+        @device(dev)
+        def device_body():
+            # Element types: one FIFO object, and the full host output buffer.
+            tensor_ty = np.ndarray[(N,), np.dtype[np.int32]]
+            tensor_out_ty = np.ndarray[(data_out_size,), np.dtype[np.int32]]
+
+            # Tile declarations
+            ShimTile = tile(col, 0)
+            MemTile = tile(col, 1)
+            ComputeTile = tile(col, 2)
+
+            # AIE-array data movement with object fifos
+            # (arguments presumably: name, producer tile, consumer tile, depth,
+            # element type -- verify against object_fifo's signature)
+            of_in = object_fifo("in", MemTile, ComputeTile, 2, tensor_ty)
+            of_out = object_fifo("out", ComputeTile, ShimTile, 2, tensor_ty)
+            # Repeat on the memtile is disabled in this draft:
+            # of_in.set_memtile_repeat(memtile_repeat_count)
+            object_fifo_link(of_in, of_out)
+
+            # To/from AIE-array data movement
+            @runtime_sequence(tensor_ty, tensor_ty, tensor_out_ty)
+            def sequence(A, B, C):
+                # Only C is used: one transfer of data_out_size elements
+                # described by of_out, followed by a wait on that FIFO.
+                npu_dma_memcpy_nd(
+                    metadata=of_out,
+                    bd_id=0,
+                    mem=C,
+                    sizes=[1, 1, 1, data_out_size],
+                )
+                dma_wait(of_out)
+
+    print(ctx.module)
+
+
+init_repeat()
diff --git a/test/npu-xrt/memtile_repeat/init_values_repeat/test.cpp b/test/npu-xrt/memtile_repeat/init_values_repeat/test.cpp
@@ -0,0 +1,195 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+namespace po = boost::program_options;
+
+// Verifies that the command-line option `name` was supplied and that the file
+// it names can be opened for reading.
+//
+// Throws std::runtime_error when the option is absent from `vm_in`, or when
+// opening the file it refers to fails.
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  // Guard: the option must have been given at all.
+  if (vm_in.count(name) == 0) {
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+  }
+
+  // Opening the file is the existence check; the stream is discarded after.
+  const std::string &file_path = vm_in[name].as<std::string>();
+  std::ifstream probe(file_path);
+  if (probe) {
+    return;
+  }
+  throw std::runtime_error("The " + name + " file " + file_path +
+                           " does not exist.\n");
+}
+
+// Reads an instruction sequence from the text file at `instr_path`.
+//
+// Each line of the file is expected to start with one hexadecimal 32-bit
+// word; that word is appended to the result and the rest of the line is
+// ignored.
+//
+// Returns the words in file order.
+// Throws std::runtime_error if the file cannot be opened, or if any line
+// (including an empty one) does not begin with a parsable hexadecimal value.
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  // Without this check an unreadable file would silently yield an empty
+  // sequence, and the failure would only surface later on the device.
+  if (!instr_file) {
+    throw std::runtime_error("Unable to open instruction file " + instr_path +
+                             "\n");
+  }
+
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+// Host-side test driver.
+//
+// Parses the command line, loads the xclbin and the instruction sequence,
+// runs the selected kernel once through XRT, and compares the first `length`
+// words of the output buffer against the sequence 1, 2, 3, ...
+//
+// Returns 0 and prints "PASS!" when every word matches; returns 1 on a
+// mismatch, on a command-line error, when --help is requested, when the
+// length is not positive, or when no kernel in the xclbin matches the
+// requested name. File and instruction parsing errors propagate as
+// exceptions from the helper functions.
+int main(int argc, const char *argv[]) {
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "verbosity,v", po::value<int>()->default_value(0),
+      "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6")(
+      "length,l", po::value<int>()->default_value(4096),
+      "the length of the transfer in int32_t")(
+      "repeat,r", po::value<int>()->default_value(3),
+      "the memtile repeat count");
+  po::variables_map vm;
+
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+
+    // Handle --help before notify(): notify() enforces the required options
+    // and would otherwise reject a bare `--help` invocation with an error.
+    if (vm.count("help")) {
+      std::cout << desc << std::endl;
+      return 1;
+    }
+
+    po::notify(vm);
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << std::endl;
+    return 1;
+  }
+
+  check_arg_file_exists(vm, "xclbin");
+  check_arg_file_exists(vm, "instr");
+
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << std::endl;
+
+  int N = vm["length"].as<int>();
+  // A non-positive length would turn into a huge unsigned buffer size below.
+  if (N <= 0) {
+    std::cerr << "Error: length must be positive, got " << N << std::endl;
+    return 1;
+  }
+  const uint32_t out_count = static_cast<uint32_t>(N);
+
+  // Only referenced by the commented-out repeat variants further down.
+  [[maybe_unused]] int repeat_count = vm["repeat"].as<int>();
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>()
+              << std::endl;
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>()
+              << std::endl;
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin: the first one whose name starts with Node
+  auto xkernels = xclbin.get_kernels();
+  auto kernel_it = std::find_if(xkernels.begin(), xkernels.end(),
+                                [&Node](xrt::xclbin::kernel &k) {
+                                  auto name = k.get_name();
+                                  std::cout << "Name: " << name << std::endl;
+                                  return name.rfind(Node, 0) == 0;
+                                });
+  // Dereferencing end() is undefined behaviour, so fail cleanly instead.
+  if (kernel_it == xkernels.end()) {
+    std::cerr << "Error: no kernel starting with '" << Node
+              << "' was found in " << vm["xclbin"].as<std::string>()
+              << std::endl;
+    return 1;
+  }
+  auto xkernel = *kernel_it;
+  auto kernelName = xkernel.get_name();
+
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context." << std::endl;
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << std::endl;
+  auto kernel = xrt::kernel(context, kernelName);
+
+  // The instruction buffer holds uint32_t words, so size it in those units.
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(uint32_t),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_inA = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY,
+                        kernel.group_id(3));
+  auto bo_inB = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY,
+                        kernel.group_id(4));
+  // NOTE(review): the accompanying aie2.py transfers N * depth words into its
+  // output argument, while this buffer only holds N words -- confirm the two
+  // sizes are meant to differ before relying on this test.
+  auto bo_out = xrt::bo(device, N /** (repeat_count + 1)*/ * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects." << std::endl;
+
+  // int32_t *bufInA = bo_inA.map<int32_t *>();
+  // std::vector<uint32_t> srcVecA;
+  // for (int i = 0; i < N; i++)
+  //   srcVecA.push_back(i + 1);
+  // memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t)));
+
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(uint32_t));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel." << std::endl;
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+  run.wait();
+
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  uint32_t *bufOut = bo_out.map<uint32_t *>();
+
+  // Expected output is the sequence 1, 2, 3, ... over the first N words.
+  int errors = 0;
+  for (uint32_t i = 0; i < out_count /** (repeat_count + 1)*/; i++) {
+    uint32_t ref = i + 1;
+    if (*(bufOut + i) != ref) {
+      std::cout << "error at index[" << i << "]: expected " << ref << " got "
+                << *(bufOut + i) << std::endl;
+      errors++;
+    }
+  }
+
+  if (!errors) {
+    std::cout << std::endl << "PASS!" << std::endl << std::endl;
+    return 0;
+  } else {
+    std::cout << std::endl
+              << errors << " mismatches." << std::endl
+              << std::endl;
+    std::cout << std::endl << "fail." << std::endl << std::endl;
+    return 1;
+  }
+}
diff --git a/test/python/objFifo.py b/test/python/objFifo.py
@@ -55,6 +55,15 @@ def objFifo_example():
         of2 = object_fifo("of2", T_, C_, 2, np.ndarray[(256,), np.dtype[np.int32]])
         of2.set_via_shared_mem(ObjectFifoPort.Consume)
 
+        of3 = object_fifo(
+            "of3",
+            M,
+            C_,
+            2,
+            np.ndarray[(2,), np.dtype[np.int32]],
+            initValues = [],
+        )
+
         C = Core(T_)
         bb = Block.create_at_start(C.body)
         with InsertionPoint(bb):