diff --git a/programming_examples/basic/matrix_multiplication/whole_array/matrixMultiplication.exe b/programming_examples/basic/matrix_multiplication/whole_array/matrixMultiplication.exe new file mode 100755 index 0000000000..d39f703905 Binary files /dev/null and b/programming_examples/basic/matrix_multiplication/whole_array/matrixMultiplication.exe differ diff --git a/programming_examples/basic/passthrough_dmas/test.cpp b/programming_examples/basic/passthrough_dmas/test.cpp index 9c11596119..63248d0959 100644 --- a/programming_examples/basic/passthrough_dmas/test.cpp +++ b/programming_examples/basic/passthrough_dmas/test.cpp @@ -166,8 +166,22 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel." << std::endl; unsigned int opcode = 3; - auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); - run.wait(); + + std::ofstream f_time; + f_time.open("time.txt"); + for (int i=1; i<=1000; i++) { + auto start = std::chrono::high_resolution_clock::now(); + + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + run.wait(); + + auto stop = std::chrono::high_resolution_clock::now(); + float npu_time = std::chrono::duration_cast(stop - start).count(); + if (i<11) + std::cout << i << " " << srcVecA.size() << " NPU time: " << npu_time << "us." 
<< std::endl; + f_time << npu_time << "\n"; + } + f_time.close(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/passthrough_dmas/time.txt b/programming_examples/basic/passthrough_dmas/time.txt new file mode 100644 index 0000000000..22b154f9ae --- /dev/null +++ b/programming_examples/basic/passthrough_dmas/time.txt @@ -0,0 +1,1000 @@ +2148 +331 +335 +253 +227 +154 +148 +148 +150 +150 +139 +128 +121 +127 +123 +123 +123 +123 +125 +131 +130 +131 +130 +131 +157 +136 +159 +163 +169 +164 +164 +162 +143 +133 +120 +120 +121 +120 +120 +117 +128 +127 +127 +126 +126 +112 +126 +113 +126 +126 +133 +126 +125 +130 +126 +124 +127 +127 +125 +108 +126 +128 +127 +112 +115 +109 +114 +114 +113 +115 +115 +115 +106 +121 +121 +121 +120 +119 +120 +119 +111 +122 +122 +121 +121 +122 +122 +122 +108 +188 +209 +152 +122 +121 +122 +113 +106 +109 +110 +109 +110 +114 +110 +111 +111 +122 +122 +122 +121 +119 +120 +120 +108 +121 +122 +120 +120 +120 +120 +120 +120 +118 +198 +179 +116 +121 +120 +108 +147 +199 +179 +106 +106 +106 +106 +105 +121 +122 +120 +119 +119 +120 +119 +119 +120 +120 +119 +119 +119 +119 +119 +111 +125 +121 +120 +118 +125 +121 +121 +119 +112 +120 +120 +119 +119 +119 +119 +119 +107 +120 +121 +121 +121 +120 +119 +119 +123 +110 +119 +120 +120 +119 +119 +119 +119 +105 +105 +105 +106 +108 +114 +105 +105 +105 +105 +121 +121 +121 +120 +118 +121 +120 +120 +124 +196 +191 +120 +120 +120 +108 +104 +105 +105 +105 +105 +105 +105 +104 +106 +107 +121 +122 +118 +102 +119 +120 +119 +117 +110 +120 +120 +120 +119 +120 +121 +119 +155 +199 +191 +119 +119 +119 +107 +121 +121 +121 +120 +119 +121 +120 +121 +113 +120 +119 +124 +102 +121 +120 +120 +106 +105 +106 +106 +108 +104 +104 +106 +105 +104 +120 +120 +120 +120 +118 +119 +119 +119 +112 +119 +119 +120 +119 +119 +120 +119 +107 +121 +121 +119 +114 +121 +120 +119 +123 +110 +121 +120 +119 +119 +118 +119 +103 +194 +203 +183 +114 +110 +110 +122 +120 +119 +119 +119 +119 +119 +119 +118 +111 +109 +113 +110 +103 +120 +119 +119 
+119 +119 +121 +119 +119 +119 +120 +119 +119 +105 +121 +120 +121 +121 +119 +119 +119 +112 +117 +198 +190 +120 +120 +120 +108 +105 +105 +106 +104 +98 +110 +114 +108 +122 +120 +121 +121 +119 +121 +120 +121 +119 +112 +120 +120 +119 +119 +119 +120 +119 +108 +105 +105 +104 +104 +105 +105 +104 +105 +105 +121 +121 +120 +124 +119 +121 +103 +121 +117 +121 +120 +119 +120 +120 +119 +119 +108 +122 +121 +121 +120 +119 +124 +119 +120 +135 +198 +191 +120 +120 +121 +108 +105 +106 +104 +104 +99 +110 +110 +109 +119 +121 +121 +121 +119 +119 +121 +120 +120 +113 +110 +110 +119 +119 +119 +119 +119 +119 +119 +119 +119 +119 +119 +119 +119 +111 +121 +121 +120 +124 +123 +121 +122 +124 +111 +110 +110 +110 +106 +105 +110 +110 +110 +120 +119 +119 +119 +119 +119 +119 +119 +107 +121 +121 +120 +120 +121 +122 +121 +120 +129 +199 +173 +121 +119 +104 +108 +165 +200 +180 +107 +106 +106 +105 +124 +121 +121 +120 +119 +119 +119 +119 +113 +119 +123 +120 +120 +119 +119 +119 +106 +121 +121 +119 +125 +120 +121 +121 +119 +121 +120 +120 +120 +120 +118 +119 +119 +105 +106 +105 +106 +105 +105 +104 +104 +105 +105 +136 +198 +192 +121 +121 +121 +108 +129 +199 +196 +106 +105 +104 +105 +105 +105 +104 +103 +110 +106 +105 +105 +105 +105 +110 +197 +191 +121 +121 +121 +119 +119 +119 +122 +119 +119 +120 +120 +121 +122 +119 +119 +128 +120 +128 +121 +100 +97 +98 +97 +113 +114 +114 +116 +114 +109 +120 +124 +127 +121 +123 +121 +123 +121 +117 +114 +114 +113 +114 +114 +111 +115 +99 +105 +122 +122 +119 +107 +120 +121 +122 +112 +123 +122 +122 +122 +123 +126 +122 +117 +122 +123 +122 +121 +120 +121 +103 +118 +114 +114 +117 +115 +114 +115 +114 +115 +114 +125 +199 +172 +114 +114 +114 +118 +120 +122 +122 +121 +120 +120 +126 +121 +176 +198 +187 +122 +122 +122 +121 +121 +121 +120 +120 +120 +120 +123 +117 +143 +199 +183 +122 +121 +121 +117 +114 +113 +114 +113 +111 +110 +114 +111 +116 +199 +198 +130 +122 +121 +121 +110 +110 +110 +111 +110 +111 +110 +110 +109 +110 +122 +121 +125 +121 +121 +120 +120 +112 +123 +122 +123 +122 +121 +122 +121 
+113 +122 +122 +122 +121 +120 +121 +121 +113 +157 +199 +193 +122 +122 +122 +113 +110 +114 +110 +109 +111 +110 +110 +109 +110 +123 +122 +122 +121 +120 +122 +122 +116 +122 +122 +122 +121 +120 +125 +120 +112 +156 +199 +193 +122 +122 +121 +112 +109 +110 +109 +112 +114 +112 +115 +114 +115 +199 +198 +127 +122 +123 +121 +109 +114 +110 +111 +110 +110 +110 +110 +110 +121 +121 +122 +121 +120 +122 +122 +122 +112 +123 +122 +119 +102 +121 +121 +121 +117 +118 +114 +114 +114 +109 +110 +115 +111 +111 +122 +121 +122 +121 +120 +120 +120 +112 +123 +122 +121 +121 +122 +122 +122 +112 +162 +199 +181 +122 +120 +103 +118 +115 +114 +113 +113 +109 +110 +110 +110 +110 +123 +122 +122 +121 +120 +120 +121 +112 +155 +199 +197 +122 +122 +121 +111 +110 +109 +110 +148 +156 +161 +161 +169 +171 +169 +168 +172 +152 +144 +127 +116 +117 +114 +115 +113 +115 +124 +123 +123 +122 +123 +124 +123 +112 +122 +121 +121 +110 +122 +121 +120 +112 +164 +200 +193 +123 +123 +122 +113 +110 +111 +115 +110 +109 +110 +110 +110 +110 +122 +122 +122 +121 +120 +122 +121 +117 +122 +122 +121 +123 +121 +121 +121 +112 +122 +122 +122 +121 +120 +122 +121 +120 +177 +197 +193 +122 +122 +122 +112 +110 +110 +110 +110 +109 +110 +110 +110 +110 +123 +121 +120 +108 +126 +120 +121 +116 +123 +123 +122 +121 +121 +123 +121 +117 +122 +122 +122 +121 +120 +121 +121 +120 +114 +114 +123 +122 +123 +122 +121 +123 +114 +115 +115 +113 diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile index 11f2824a42..07e9a1d376 100644 --- a/programming_examples/basic/passthrough_kernel/Makefile +++ b/programming_examples/basic/passthrough_kernel/Makefile @@ -14,7 +14,9 @@ include ${srcdir}/../../makefile-common targetname = passThroughKernel VPATH := ${srcdir}/../../../aie_kernels/generic -data_size = 4096 +data_size = 20480 + + trace_size = 8192 PASSTHROUGH_SIZE = ${data_size} diff --git a/programming_examples/basic/passthrough_kernel/test.cpp 
b/programming_examples/basic/passthrough_kernel/test.cpp index f28691abcd..127149b84a 100644 --- a/programming_examples/basic/passthrough_kernel/test.cpp +++ b/programming_examples/basic/passthrough_kernel/test.cpp @@ -88,8 +88,22 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; unsigned int opcode = 3; - auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_out); - run.wait(); + + std::ofstream f_time; + f_time.open("time.txt"); + for (int i=1; i<=1000; i++) { + auto start = std::chrono::high_resolution_clock::now(); + + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_out); + run.wait(); + + auto stop = std::chrono::high_resolution_clock::now(); + float npu_time = std::chrono::duration_cast(stop - start).count(); + if (i<11) + std::cout << i << " " << PASSTHROUGH_SIZE << " NPU time: " << npu_time << "us." << std::endl; + f_time << npu_time << "\n"; + } + f_time.close(); // Sync device to host memories bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py index 814f8c7a6a..5751a26a6c 100644 --- a/programming_examples/basic/passthrough_kernel/test.py +++ b/programming_examples/basic/passthrough_kernel/test.py @@ -81,8 +81,22 @@ def main(opts): if opts.verbosity >= 1: print("Running Kernel.") opcode = 3 - h = kernel(opcode, bo_instr, len(instr_v), bo_inout0, bo_inout1) - h.wait() + + with open("time_py.txt", "w") as f: + + for i in range(1000): + start = time.time_ns() + + h = kernel(opcode, bo_instr, len(instr_v), bo_inout0, bo_inout1) + h.wait() + + stop = time.time_ns() + npu_time_ms = round((stop - start)/1000) + f.write(f"{npu_time_ms}\n") + print(f"{i}: {opts.size} - {npu_time_ms}") + + + bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) # Copy output results and verify they are correct diff --git a/programming_examples/basic/passthrough_kernel/time.txt 
b/programming_examples/basic/passthrough_kernel/time.txt new file mode 100644 index 0000000000..e2f3556454 --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/time.txt @@ -0,0 +1,1000 @@ +353 +242 +207 +176 +149 +147 +149 +146 +132 +132 +131 +135 +136 +118 +134 +135 +135 +135 +134 +135 +136 +135 +134 +134 +135 +135 +134 +134 +136 +135 +134 +133 +133 +131 +128 +128 +128 +128 +131 +128 +128 +131 +137 +124 +137 +132 +132 +132 +132 +132 +138 +132 +131 +136 +136 +134 +133 +135 +135 +135 +133 +133 +136 +134 +134 +134 +134 +133 +132 +134 +132 +127 +124 +122 +132 +126 +125 +125 +125 +124 +126 +126 +126 +124 +125 +126 +125 +125 +125 +125 +125 +124 +124 +125 +124 +125 +125 +125 +125 +126 +126 +131 +189 +204 +197 +126 +127 +127 +124 +125 +126 +126 +126 +126 +125 +124 +125 +125 +124 +125 +125 +125 +125 +125 +124 +124 +125 +124 +125 +125 +124 +125 +125 +124 +107 +125 +126 +126 +126 +126 +125 +126 +126 +125 +126 +125 +125 +125 +125 +124 +124 +124 +131 +124 +124 +124 +158 +200 +197 +126 +126 +126 +125 +124 +130 +127 +126 +129 +126 +124 +124 +126 +126 +126 +125 +125 +125 +125 +124 +124 +125 +128 +112 +125 +125 +125 +125 +125 +125 +124 +126 +126 +125 +124 +124 +124 +110 +127 +127 +126 +125 +129 +127 +129 +126 +125 +126 +126 +124 +126 +125 +126 +127 +125 +126 +125 +125 +125 +124 +124 +125 +126 +126 +125 +124 +127 +126 +131 +127 +125 +126 +125 +126 +126 +125 +127 +125 +125 +125 +125 +125 +189 +208 +197 +125 +126 +124 +125 +124 +124 +125 +125 +125 +125 +129 +125 +125 +117 +127 +125 +125 +125 +127 +190 +207 +197 +126 +126 +125 +125 +125 +125 +125 +125 +125 +126 +125 +125 +125 +125 +126 +126 +127 +125 +125 +124 +125 +108 +130 +126 +126 +126 +127 +125 +126 +126 +125 +126 +125 +125 +125 +167 +200 +197 +125 +129 +125 +125 +125 +124 +126 +126 +126 +124 +124 +107 +124 +127 +126 +126 +126 +125 +125 +203 +201 +198 +126 +126 +125 +125 +125 +124 +125 +107 +125 +134 +130 +126 +126 +125 +125 +126 +127 +126 +127 +126 +125 +109 +129 +125 +126 +126 +126 +125 +126 +125 +129 +125 +126 
+125 +125 +166 +200 +198 +126 +126 +127 +125 +126 +125 +126 +126 +127 +125 +125 +126 +125 +108 +130 +126 +126 +125 +126 +126 +126 +126 +125 +126 +126 +125 +125 +177 +197 +198 +110 +112 +125 +126 +125 +124 +126 +127 +126 +126 +125 +125 +125 +132 +200 +183 +183 +182 +146 +142 +141 +137 +138 +139 +131 +132 +132 +132 +131 +130 +130 +131 +125 +125 +125 +126 +126 +126 +125 +125 +132 +131 +126 +132 +131 +132 +131 +126 +132 +131 +131 +130 +129 +131 +132 +126 +126 +126 +126 +125 +124 +125 +125 +132 +139 +132 +131 +130 +132 +132 +128 +131 +118 +136 +132 +132 +131 +131 +127 +126 +126 +126 +126 +126 +126 +126 +131 +132 +132 +131 +130 +130 +131 +127 +131 +131 +132 +131 +130 +131 +131 +132 +129 +135 +132 +132 +132 +132 +128 +131 +132 +132 +119 +141 +132 +136 +126 +125 +126 +126 +126 +108 +126 +126 +126 +126 +125 +125 +126 +126 +126 +125 +125 +125 +109 +125 +125 +125 +126 +125 +125 +125 +125 +125 +125 +126 +126 +126 +125 +108 +126 +126 +109 +108 +108 +108 +128 +130 +126 +126 +126 +126 +128 +125 +184 +198 +203 +126 +127 +126 +126 +125 +127 +127 +126 +126 +128 +126 +126 +126 +125 +125 +125 +125 +126 +126 +168 +200 +198 +127 +126 +127 +126 +125 +109 +127 +126 +125 +127 +125 +126 +130 +126 +127 +126 +125 +125 +126 +127 +126 +108 +126 +127 +125 +126 +125 +126 +130 +126 +125 +125 +125 +126 +126 +127 +124 +109 +126 +127 +126 +126 +126 +125 +126 +126 +125 +126 +126 +126 +126 +127 +126 +125 +125 +126 +125 +125 +125 +125 +126 +126 +125 +125 +125 +125 +125 +125 +127 +127 +127 +126 +126 +117 +130 +139 +133 +133 +131 +138 +131 +125 +130 +129 +129 +130 +127 +127 +127 +124 +123 +123 +124 +124 +124 +123 +123 +125 +133 +136 +133 +132 +131 +134 +128 +131 +130 +130 +130 +128 +129 +129 +126 +123 +124 +124 +124 +123 +124 +124 +126 +133 +137 +132 +136 +133 +133 +128 +130 +128 +131 +130 +129 +128 +128 +127 +132 +133 +133 +133 +131 +133 +131 +123 +126 +129 +124 +124 +124 +123 +125 +130 +132 +128 +128 +130 +130 +130 +126 +132 +134 +134 +134 +133 +132 +129 +126 +124 +124 +123 +124 +123 +123 +124 +131 +130 
+131 +130 +129 +129 +129 +129 +123 +123 +124 +124 +124 +123 +123 +125 +133 +129 +121 +121 +109 +126 +125 +125 +123 +120 +125 +121 +108 +175 +178 +188 +145 +140 +140 +140 +135 +130 +111 +109 +128 +129 +129 +130 +129 +121 +120 +119 +121 +121 +121 +120 +120 +120 +130 +131 +130 +130 +135 +131 +131 +128 +129 +130 +134 +129 +129 +128 +127 +120 +120 +120 +120 +120 +121 +120 +120 +121 +131 +134 +131 +129 +129 +131 +128 +129 +130 +129 +129 +129 +128 +129 +121 +131 +131 +130 +130 +129 +132 +129 +122 +121 +121 +126 +126 +126 +124 +125 +126 +130 +110 +129 +128 +128 +128 +128 +127 +127 +129 +128 +128 +128 +128 +121 +132 +131 +131 +131 +131 +130 +129 +126 +123 +123 +126 +125 +125 +125 +124 +124 +272 +222 +130 +129 +129 +121 +120 +121 +121 +121 +120 +120 +121 +121 +131 +131 +131 +130 +129 +129 +133 +130 +128 +128 +128 +128 +128 +128 +121 +131 +138 +131 +130 +129 +131 +112 +130 +126 +126 +121 +121 +121 +121 +122 +125 +125 +125 +125 +125 +125 +124 +125 +125 +125 +125 +127 +127 +126 +126 +125 +125 +125 +112 +128 +126 +125 +124 +128 +126 +126 +125 +129 +126 +126 +126 +125 +186 +201 +196 +126 +126 +126 +124 +125 +125 +125 +124 +124 +125 +126 +126 +126 +117 +126 +126 +126 +126 +126 +126 +125 +124 +127 +126 diff --git a/programming_examples/basic/passthrough_kernel/time_py.txt b/programming_examples/basic/passthrough_kernel/time_py.txt new file mode 100644 index 0000000000..3c0646f78b --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/time_py.txt @@ -0,0 +1,1000 @@ +416 +285 +279 +203 +193 +173 +142 +139 +139 +137 +137 +137 +137 +142 +197 +199 +142 +145 +151 +152 +150 +151 +151 +150 +150 +125 +142 +140 +140 +140 +151 +140 +138 +136 +135 +136 +140 +141 +135 +135 +139 +139 +139 +138 +137 +137 +134 +130 +134 +133 +140 +131 +130 +133 +123 +134 +140 +140 +138 +138 +114 +144 +132 +133 +125 +122 +111 +179 +193 +155 +154 +120 +131 +127 +126 +123 +126 +123 +123 +124 +126 +126 +126 +127 +141 +134 +132 +132 +133 +134 +130 +128 +128 +128 +127 +128 +127 +128 +128 +134 +138 +147 +135 
+132 +132 +127 +134 +134 +135 +135 +134 +133 +126 +117 +128 +130 +128 +128 +128 +127 +127 +127 +127 +127 +127 +128 +129 +127 +127 +133 +133 +134 +134 +132 +132 +129 +128 +133 +139 +128 +128 +127 +127 +122 +135 +134 +134 +136 +130 +133 +125 +134 +141 +135 +134 +132 +134 +129 +128 +127 +130 +127 +127 +127 +124 +128 +132 +133 +132 +132 +132 +132 +139 +146 +139 +125 +134 +135 +133 +132 +128 +127 +127 +127 +117 +124 +123 +129 +132 +130 +131 +132 +131 +131 +131 +124 +134 +134 +134 +132 +132 +134 +112 +130 +122 +123 +123 +123 +123 +121 +123 +134 +140 +143 +132 +132 +133 +130 +132 +131 +131 +132 +122 +142 +128 +131 +134 +133 +133 +134 +133 +132 +135 +142 +132 +131 +131 +131 +131 +131 +123 +134 +135 +134 +134 +133 +134 +128 +129 +127 +134 +127 +124 +127 +129 +132 +132 +132 +132 +132 +131 +131 +111 +140 +133 +132 +132 +132 +132 +129 +126 +127 +127 +127 +125 +127 +126 +128 +127 +127 +132 +131 +131 +131 +132 +131 +131 +141 +129 +129 +132 +131 +111 +140 +134 +133 +131 +121 +145 +133 +127 +127 +127 +128 +124 +127 +127 +128 +132 +131 +131 +131 +131 +131 +131 +123 +133 +133 +134 +133 +132 +132 +111 +125 +123 +140 +129 +136 +137 +189 +273 +257 +263 +208 +199 +195 +193 +193 +195 +195 +168 +142 +134 +140 +134 +137 +135 +136 +135 +133 +137 +142 +131 +130 +129 +128 +133 +128 +128 +134 +133 +115 +137 +132 +132 +138 +132 +132 +131 +132 +132 +132 +133 +126 +128 +129 +137 +148 +148 +147 +141 +127 +127 +127 +127 +127 +120 +123 +132 +131 +131 +131 +131 +131 +131 +123 +122 +110 +138 +129 +129 +126 +125 +111 +130 +129 +124 +129 +124 +125 +124 +128 +127 +134 +133 +144 +133 +132 +131 +131 +131 +132 +132 +131 +132 +130 +136 +136 +135 +135 +133 +133 +133 +127 +145 +145 +143 +145 +145 +144 +122 +122 +126 +123 +122 +123 +123 +123 +134 +135 +135 +135 +133 +133 +134 +127 +133 +147 +136 +131 +131 +131 +112 +154 +142 +141 +142 +148 +141 +138 +128 +127 +127 +116 +127 +128 +127 +132 +134 +131 +131 +131 +131 +131 +124 +123 +123 +123 +123 +122 +123 +122 +112 +123 +124 +135 +136 +136 +134 +130 +130 +140 +128 
+126 +127 +126 +127 +127 +131 +136 +132 +115 +132 +132 +130 +123 +122 +123 +122 +122 +123 +123 +125 +142 +135 +135 +133 +133 +135 +135 +129 +132 +132 +131 +130 +131 +131 +124 +122 +124 +140 +129 +126 +127 +128 +124 +129 +142 +123 +122 +123 +122 +122 +233 +147 +183 +149 +148 +129 +149 +167 +148 +144 +147 +145 +141 +128 +128 +127 +128 +128 +128 +132 +135 +141 +139 +135 +134 +135 +132 +143 +129 +118 +130 +128 +127 +127 +127 +127 +127 +127 +127 +127 +127 +127 +128 +136 +136 +136 +135 +134 +134 +130 +135 +133 +134 +136 +132 +132 +134 +146 +138 +132 +131 +132 +132 +131 +126 +134 +171 +268 +256 +184 +185 +184 +184 +193 +195 +165 +188 +194 +168 +127 +140 +132 +132 +132 +131 +132 +124 +123 +123 +132 +123 +122 +123 +122 +122 +134 +133 +134 +137 +131 +132 +130 +127 +128 +132 +132 +131 +131 +130 +116 +132 +132 +130 +131 +132 +131 +131 +145 +134 +135 +133 +132 +133 +132 +129 +132 +131 +138 +131 +131 +131 +124 +123 +123 +123 +123 +123 +123 +122 +109 +135 +134 +133 +133 +134 +134 +132 +128 +132 +132 +132 +131 +132 +131 +124 +123 +122 +123 +123 +124 +127 +123 +123 +134 +139 +134 +132 +133 +142 +129 +134 +132 +132 +137 +129 +128 +131 +123 +122 +123 +125 +122 +123 +122 +123 +123 +134 +134 +134 +132 +131 +131 +128 +127 +132 +132 +132 +131 +131 +138 +123 +123 +141 +128 +127 +126 +127 +130 +132 +186 +264 +267 +195 +194 +193 +193 +194 +188 +184 +185 +190 +159 +135 +128 +127 +123 +123 +126 +123 +123 +126 +124 +134 +124 +124 +125 +123 +123 +123 +126 +124 +119 +129 +123 +139 +133 +122 +122 +122 +122 +123 +122 +123 +115 +123 +123 +122 +123 +122 +122 +123 +122 +126 +122 +122 +122 +122 +123 +122 +122 +122 +129 +122 +122 +122 +122 +122 +122 +123 +109 +124 +128 +127 +124 +133 +129 +123 +127 +123 +124 +126 +123 +123 +123 +123 +126 +123 +123 +123 +123 +123 +123 +126 +123 +123 +126 +123 +126 +127 +125 +128 +133 +124 +127 +122 +122 +133 +124 +124 +123 +114 +124 +143 +130 +123 +123 +122 +122 +122 +122 +122 +123 +122 +109 +182 +198 +195 +145 +127 +128 +126 +127 +123 +124 +123 +260 +195 +155 +150 +153 
+144 +133 +132 +131 +130 +135 +129 +129 +123 +127 +128 +126 +127 +124 +132 +124 +127 +132 +132 +132 +132 +132 +144 +114 +147 +146 +145 +146 +145 +144 +134 +127 +138 +135 +123 +122 +122 +130 +132 +131 +131 +115 +132 +129 +128 +131 +132 +134 +133 +133 +133 +133 +131 +136 +131 +131 +131 +132 +132 +130 +122 +134 +134 +134 +131 +138 +133 +128 +126 +128 +126 +127 +126 +124 +127 +127 +133 +132 +112 +118 +131 +131 +131 +137 +134 +136 +133 +134 +133 +133 +129 +128 +127 +128 +128 +128 +127 diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile index e969be08f3..4b43a14735 100644 --- a/programming_examples/basic/vector_scalar_mul/Makefile +++ b/programming_examples/basic/vector_scalar_mul/Makefile @@ -18,57 +18,57 @@ targetname = vectorScalar data_size = 4096 trace_size = 8192 -all: build/final_${data_size}.xclbin build/insts_${data_size}.txt +all: build/final.xclbin build/insts.txt -kristof: build/insts_${data_size}.txt +kristof: build/insts.txt build/%.o: %.cc mkdir -p ${@D} cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F} -build/aie_${data_size}.mlir: ${srcdir}/aie2.py +build/aie.mlir: ${srcdir}/aie2.py mkdir -p ${@D} python3 $< ${data_size} 0 > $@ -build/aie_trace_${data_size}.mlir: aie2.py +build/aie_trace.mlir: aie2.py mkdir -p ${@D} python3 $< ${data_size} ${trace_size} > $@ -#build/insts_${data_size}.txt: build/final_${data_size}.xclbin -build/final_${data_size}.xclbin: build/aie_${data_size}.mlir build/scale.o +#build/insts.txt: build/final.xclbin +build/final.xclbin: build/aie.mlir build/scale.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) -build/final_trace_${data_size}.xclbin: build/aie_trace_${data_size}.mlir build/scale.o +build/final_trace.xclbin: build/aie_trace.mlir build/scale.o mkdir -p ${@D} 
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) -${targetname}_${data_size}.exe: ${srcdir}/test.cpp +${targetname}.exe: ${srcdir}/test.cpp rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname}_${data_size} -DVECTORSCALARMUL_SIZE=${data_size} + cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname} -DVECTORSCALARMUL_SIZE=${data_size} cd _build && ${powershell} cmake --build . --config Release ifeq "${powershell}" "powershell.exe" - cp _build/${targetname}_${data_size}.exe $@ + cp _build/${targetname}.exe $@ else - cp _build/${targetname}_${data_size} $@ + cp _build/${targetname} $@ endif -run: ${targetname}_${data_size}.exe build/final_${data_size}.xclbin build/insts_${data_size}.txt - ${powershell} ./$< -x build/final_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE +run: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -run_py: build/final_${data_size}.xclbin build/insts_${data_size}.txt - ${powershell} python3 ${srcdir}/test.py -x build/final_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -s ${data_size} +run_py: build/final.xclbin build/insts.txt + ${powershell} python3 ${srcdir}/test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -s ${data_size} -trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt - ${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} - ../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json +trace: ${targetname}.exe build/final_trace.xclbin build/insts.txt + ${powershell} ./$< -x build/final_trace.xclbin -i 
build/insts.txt -k MLIR_AIE -t ${trace_size} + ../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 > trace_vs.json -trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt - ${powershell} python3 ${srcdir}/test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -s ${data_size} - ../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json +trace_py: build/final_trace.xclbin build/insts.txt + ${powershell} python3 ${srcdir}/test.py -x build/final_trace.xclbin -i build/insts.txt -k MLIR_AIE -t ${trace_size} -s ${data_size} + ../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 > trace_vs.json clean_trace: diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py index dd02a1010c..8d5327b3e7 100644 --- a/programming_examples/basic/vector_scalar_mul/aie2.py +++ b/programming_examples/basic/vector_scalar_mul/aie2.py @@ -18,7 +18,7 @@ def my_vector_scalar(vector_size, trace_size): N = vector_size - N_in_bytes = N * 2 + N_in_bytes = N * 4 N_div_n = 4 # chop input vector into 4 sub-vectors n = N // N_div_n @@ -28,17 +28,17 @@ def my_vector_scalar(vector_size, trace_size): @device(AIEDevice.npu1_1col) def device_body(): - memRef_ty = T.memref(n, T.i16()) + memRef_ty = T.memref(n, T.i32()) memRef_ty2 = T.memref(1, T.i32()) # AIE Core Function declarations scale_scalar = external_func( - "vector_scalar_mul_int16_scalar", + "vector_scalar_mul_int32_scalar", inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()], ) scale = external_func( - "vector_scalar_mul_int16_vector", + "vector_scalar_mul_int32_vector", inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()], ) @@ -80,7 +80,7 @@ def core_body(): yield_([]) # To/from AIE-array data movement - tensor_ty = T.memref(N, T.i16()) + tensor_ty = T.memref(N, T.i32()) scalar_ty 
= T.memref(1, T.i32()) @runtime_sequence(tensor_ty, scalar_ty, tensor_ty) diff --git a/programming_examples/basic/vector_scalar_mul/test.cpp b/programming_examples/basic/vector_scalar_mul/test.cpp index d4acb04292..0a454cc628 100644 --- a/programming_examples/basic/vector_scalar_mul/test.cpp +++ b/programming_examples/basic/vector_scalar_mul/test.cpp @@ -22,8 +22,8 @@ // Configure this to match your buffer data type // ------------------------------------------------------ // using DATATYPE = std::uint8_t; -// using DATATYPE = std::uint32_t; -using DATATYPE = std::uint16_t; +using DATATYPE = std::uint32_t; +// using DATATYPE = std::uint16_t; #endif const int scaleFactor = 3; @@ -100,12 +100,16 @@ int main(int argc, const char *argv[]) { bo_outC.sync(XCL_BO_SYNC_BO_TO_DEVICE); // Execute the kernel and wait to finish + auto start = std::chrono::high_resolution_clock::now(); if (verbosity >= 1) std::cout << "Running Kernel.\n"; unsigned int opcode = 3; auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + float npu_time = std::chrono::duration_cast(stop - start).count(); + std::cout << "NPU time: " << npu_time << "us." 
<< std::endl; // Sync device to host memories bo_outC.sync(XCL_BO_SYNC_BO_FROM_DEVICE); @@ -128,10 +132,10 @@ int main(int argc, const char *argv[]) { } } - if (trace_size > 0) { - test_utils::write_out_trace(((char *)bufOut) + IN_SIZE, trace_size, - vm["trace_file"].as()); - } + // if (trace_size > 0) { + // test_utils::write_out_trace(((char *)bufOut) + IN_SIZE, trace_size, + // vm["trace_file"].as()); + // } // Print Pass/Fail result of our test if (!errors) { diff --git a/programming_examples/basic/vector_vector_mul/aie2.py b/programming_examples/basic/vector_vector_mul/aie2.py index 414d62fa26..b32341c0ba 100644 --- a/programming_examples/basic/vector_vector_mul/aie2.py +++ b/programming_examples/basic/vector_vector_mul/aie2.py @@ -18,7 +18,7 @@ def my_vector_mul(): - N = 256 + N = 256*60 n = 16 N_div_n = N // n diff --git a/programming_examples/basic/vector_vector_mul/test.cpp b/programming_examples/basic/vector_vector_mul/test.cpp index 52af9beb06..ab43b12bdf 100644 --- a/programming_examples/basic/vector_vector_mul/test.cpp +++ b/programming_examples/basic/vector_vector_mul/test.cpp @@ -40,8 +40,8 @@ int main(int argc, const char *argv[]) { int n_warmup_iterations = vm["warmup"].as(); int trace_size = vm["trace_sz"].as(); - constexpr int IN_SIZE = 256; - constexpr int OUT_SIZE = 256; + constexpr int IN_SIZE = 256*60; + constexpr int OUT_SIZE = IN_SIZE; // Load instruction sequence std::vector instr_v = @@ -132,8 +132,20 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; unsigned int opcode = 3; - auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); - run.wait(); + + std::ofstream f_time; + f_time.open("time.txt"); + for (int i=1; i<=1000; i++) { + auto start = std::chrono::high_resolution_clock::now(); + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + run.wait(); + auto stop = std::chrono::high_resolution_clock::now(); + float npu_time = 
std::chrono::duration_cast(stop - start).count(); + if (i<11) + std::cout << i << " " << IN_SIZE << " NPU time: " << npu_time << "us." << std::endl; + f_time << npu_time << "\n"; + } + f_time.close(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/programming_examples/basic/vvmul-merged-ctrl/aie1.mlir b/programming_examples/basic/vvmul-merged-ctrl/aie1.mlir new file mode 100644 index 0000000000..fe8c67d906 --- /dev/null +++ b/programming_examples/basic/vvmul-merged-ctrl/aie1.mlir @@ -0,0 +1,17 @@ +//===- aie1.mlir -----------------------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_1col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + } +} diff --git a/programming_examples/basic/vvmul-merged-ctrl/aie2.mlir b/programming_examples/basic/vvmul-merged-ctrl/aie2.mlir new file mode 100644 index 0000000000..2b982871c0 --- /dev/null +++ b/programming_examples/basic/vvmul-merged-ctrl/aie2.mlir @@ -0,0 +1,47 @@ +module { + aie.device(npu1_1col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + aie.objectfifo @in1(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @in2(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo> + %core_0_2 = aie.core(%tile_0_2) { + %c0 = arith.constant 0 : index + %c9223372036854775807 = arith.constant 9223372036854775807 : index + %c1 = arith.constant 1 : index + scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { + %c0_0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1_1 = arith.constant 1 : 
index + scf.for %arg1 = %c0_0 to %c256 step %c1_1 { + %0 = aie.objectfifo.acquire @in1(Consume, 1) : !aie.objectfifosubview> + %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<16xi32> + %2 = aie.objectfifo.acquire @in2(Consume, 1) : !aie.objectfifosubview> + %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<16xi32> + %4 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview> + %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<16xi32> + %c0_2 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c1_3 = arith.constant 1 : index + scf.for %arg2 = %c0_2 to %c16 step %c1_3 { + %6 = memref.load %1[%arg2] : memref<16xi32> + %7 = memref.load %3[%arg2] : memref<16xi32> + %8 = arith.muli %6, %7 : i32 + memref.store %8, %5[%arg2] : memref<16xi32> + } + aie.objectfifo.release @in1(Consume, 1) + aie.objectfifo.release @in2(Consume, 1) + aie.objectfifo.release @out(Produce, 1) + } + } + aie.end + } + aiex.runtime_sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) { + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<4096xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1]) {id = 1 : i64, metadata = @in1} : memref<4096xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1]) {id = 2 : i64, metadata = @in2} : memref<4096xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + } + } +} + diff --git a/programming_examples/basic/vvmul-merged-ctrl/aie3.mlir b/programming_examples/basic/vvmul-merged-ctrl/aie3.mlir new file mode 100644 index 0000000000..575c684839 --- /dev/null +++ b/programming_examples/basic/vvmul-merged-ctrl/aie3.mlir @@ -0,0 +1,436 @@ +module { + aie.device(npu1_1col) { + %tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info} 
+ %tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info} + %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info} + memref.global "private" constant @blockwrite_data : memref<9xi32> = dense<[1440, 0, 0, 0, 0, 0, 0, 0, 8]> + memref.global "private" constant @blockwrite_data_0 : memref<444xi32> = dense<"0xmemref.global "private" constant @blockwrite_data_1 : memref<6xi32> = dense<[5242896, 0, 0, 0, 0, 235159520]> + memref.global "private" constant @blockwrite_data_2 : memref<6xi32> = dense<[5505040, 0, 0, 0, 0, 100941792]> + memref.global "private" constant @blockwrite_data_3 : memref<6xi32> = dense<[4718608, 0, 0, 0, 0, 503611362]> + memref.global "private" constant @blockwrite_data_4 : memref<6xi32> = dense<[4980752, 0, 0, 0, 0, 369393634]> + memref.global "private" constant @blockwrite_data_5 : memref<6xi32> = dense<[4194320, 0, 0, 0, 0, 772055013]> + memref.global "private" constant @blockwrite_data_6 : memref<6xi32> = dense<[4456464, 0, 0, 0, 0, 637837285]> + aiex.runtime_sequence(%arg0: memref<1024xi32>) { + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 2][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 4][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, 
%arg0[0, 0, 0, 6][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 8][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 10][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 15][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 20][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 22][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 27][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + 
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 32][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 37][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 42][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 47][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 52][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 57][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + 
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 62][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 67][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 72][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 77][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 82][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 87][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 92][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = 
@ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 97][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 102][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 107][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 112][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 117][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 122][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row 
= 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 127][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 132][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 137][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 142][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 147][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 152][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 157][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = 
true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 162][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 167][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 172][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 177][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 182][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 187][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, 
direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 192][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 197][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 202][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 207][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 212][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 217][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 222][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id 
= 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 227][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 232][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 237][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 242][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 247][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 252][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, 
column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 257][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 262][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 267][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 272][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 277][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 282][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 287][1, 1, 1, 5][0, 0, 
0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 292][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 297][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 302][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 307][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 312][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 317][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, 
column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 322][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 327][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 332][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 337][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 342][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 347][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 
352][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 357][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 362][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 367][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 372][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 377][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 382][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + 
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 387][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 392][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 397][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 402][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 407][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 412][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + 
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 417][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 422][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 427][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 432][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 437][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 442][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 447][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = 
@ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 452][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 457][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 462][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 467][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 472][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 477][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row 
= 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 482][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 487][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 492][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 497][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 502][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 507][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 512][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = 
true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 517][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 522][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 527][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 532][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 537][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 542][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, 
direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 547][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 552][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 557][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 562][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 567][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 572][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 577][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id 
= 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 579][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 581][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 583][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 585][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 587][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 589][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, 
column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 591][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 593][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 595][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 597][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 599][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 601][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 603][1, 1, 1, 2][0, 0, 
0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 605][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 607][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 609][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 611][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 613][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 615][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, 
column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 617][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 619][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 621][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 623][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 625][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 627][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 
629][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 631][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 633][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 635][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 637][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 639][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 641][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + 
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 643][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 645][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 650][1, 1, 1, 3][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 653][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 658][1, 1, 1, 3][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 661][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + 
aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 666][1, 1, 1, 3][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 669][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 674][1, 1, 1, 3][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 677][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 682][1, 1, 1, 3][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 685][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 690][1, 1, 1, 3][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = 
@ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 693][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 695][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 697][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 699][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 701][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 703][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row 
= 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 705][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 707][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 709][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 711][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 713][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 715][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 717][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = 
true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 719][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 721][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 723][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 725][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 727][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 729][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, 
direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 731][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 733][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 735][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 737][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 739][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 741][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 743][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id 
= 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 745][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 747][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 749][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 751][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 753][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 755][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, 
column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 757][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 759][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 761][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} + } + aie.packet_flow(15) { + aie.packet_source<%tile_0_0, Ctrl : 0> + aie.packet_dest<%tile_0_0, South : 0> + } {keep_pkt_header = true, priority_route = true} + aie.packet_flow(26) { + aie.packet_source<%tile_0_0, DMA : 0> + aie.packet_dest<%tile_0_1, Ctrl : 0> + } {keep_pkt_header = true, priority_route = true} + aie.shim_dma_allocation @ctrlpkt_col0_mm2s_chan0(MM2S, 0, 0) + memref.global "public" @ctrlpkt_col0_mm2s_chan0 : memref<2048xi32> + aie.packet_flow(15) { + aie.packet_source<%tile_0_0, DMA : 0> + aie.packet_dest<%tile_0_0, Ctrl : 0> + } {keep_pkt_header = true, priority_route = true} + aie.packet_flow(27) { + aie.packet_source<%tile_0_0, DMA : 0> + aie.packet_dest<%tile_0_2, Ctrl : 0> + } {keep_pkt_header = true, priority_route = true} + } +} + diff --git a/programming_examples/basic/vvmul-merged-ctrl/input_with_addresses-test.mlir b/programming_examples/basic/vvmul-merged-ctrl/input_with_addresses-test.mlir new file mode 100644 index 0000000000..0efe95b979 --- 
/dev/null
+++ b/programming_examples/basic/vvmul-merged-ctrl/input_with_addresses-test.mlir
@@ -0,0 +1,173 @@
+module {
+  aie.device(npu1_1col) {
+    memref.global "public" @out_cons : memref<16xi32>
+    memref.global "public" @out : memref<16xi32>
+    memref.global "public" @in2_cons : memref<16xi32>
+    memref.global "public" @in2 : memref<16xi32>
+    memref.global "public" @in1_cons : memref<16xi32>
+    memref.global "public" @in1 : memref<16xi32>
+    %tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info}  // NOTE(review): the `#aie.packet_info` and `packet = ` attribute payloads in this file have no `<...>` body; they look truncated -- confirm against the generated file
+    %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info}
+    %out_cons_prod_lock = aie.lock(%tile_0_0, 4) {init = 1 : i32, sym_name = "out_cons_prod_lock"}
+    %out_cons_cons_lock = aie.lock(%tile_0_0, 5) {init = 0 : i32, sym_name = "out_cons_cons_lock"}
+    %out_buff_0 = aie.buffer(%tile_0_2) {address = 1024 : i32, sym_name = "out_buff_0"} : memref<16xi32>
+    %out_buff_1 = aie.buffer(%tile_0_2) {address = 1088 : i32, sym_name = "out_buff_1"} : memref<16xi32>
+    %out_prod_lock = aie.lock(%tile_0_2, 4) {init = 2 : i32, sym_name = "out_prod_lock"}
+    %out_cons_lock = aie.lock(%tile_0_2, 5) {init = 0 : i32, sym_name = "out_cons_lock"}
+    %in2_cons_buff_0 = aie.buffer(%tile_0_2) {address = 1152 : i32, sym_name = "in2_cons_buff_0"} : memref<16xi32>
+    %in2_cons_buff_1 = aie.buffer(%tile_0_2) {address = 1216 : i32, sym_name = "in2_cons_buff_1"} : memref<16xi32>
+    %in2_cons_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "in2_cons_prod_lock"}
+    %in2_cons_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "in2_cons_cons_lock"}
+    %in2_prod_lock = aie.lock(%tile_0_0, 2) {init = 1 : i32, sym_name = "in2_prod_lock"}
+    %in2_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "in2_cons_lock"}
+    %in1_cons_buff_0 = aie.buffer(%tile_0_2) {address = 1280 : i32, sym_name = "in1_cons_buff_0"} : memref<16xi32>
+    %in1_cons_buff_1 = aie.buffer(%tile_0_2) {address = 1344 : i32, sym_name = "in1_cons_buff_1"} : memref<16xi32>
+    %in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "in1_cons_prod_lock"}
+    %in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "in1_cons_cons_lock"}
+    %in1_prod_lock = aie.lock(%tile_0_0, 0) {init = 1 : i32, sym_name = "in1_prod_lock"}
+    %in1_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "in1_cons_lock"}
+
+
+    aie.packet_flow(1) {
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_2, DMA : 0>
+    }
+    aie.packet_flow(2) {
+      aie.packet_source<%tile_0_0, DMA : 1>
+      aie.packet_dest<%tile_0_2, DMA : 1>
+    }
+    aie.packet_flow(3) {  // NOTE(review): same source and dest as packet_flow(1); no flow in this file routes tile (0,2) DMA 0 back to tile (0,0) for @out -- presumably this one should be tile_0_2 -> tile_0_0, confirm
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_2, DMA : 0>
+    }
+
+    %core_0_2 = aie.core(%tile_0_2) {  // element-wise i32 multiply of the in1 and in2 buffers into the out buffers, 16 elements per buffer
+      %c0 = arith.constant 0 : index
+      %c9223372036854775807 = arith.constant 9223372036854775807 : index  // outer loop bound (largest signed 64-bit value)
+      %c1 = arith.constant 1 : index
+      cf.br ^bb1(%c0 : index)
+    ^bb1(%0: index):  // 2 preds: ^bb0, ^bb11
+      %1 = arith.cmpi slt, %0, %c9223372036854775807 : index
+      cf.cond_br %1, ^bb2, ^bb12
+    ^bb2:  // pred: ^bb1
+      %c0_0 = arith.constant 0 : index
+      %c256 = arith.constant 256 : index
+      %c1_1 = arith.constant 1 : index
+      %c2 = arith.constant 2 : index  // the ^bb3 loop steps by 2: each pass handles the _0 buffers and then the _1 buffers
+      cf.br ^bb3(%c0_0 : index)
+    ^bb3(%2: index):  // 2 preds: ^bb2, ^bb10
+      %3 = arith.cmpi slt, %2, %c256 : index
+      cf.cond_br %3, ^bb4, ^bb11
+    ^bb4:  // pred: ^bb3
+      aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1)
+      aie.use_lock(%in2_cons_cons_lock, AcquireGreaterEqual, 1)
+      aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1)
+      %c0_2 = arith.constant 0 : index
+      %c16 = arith.constant 16 : index
+      %c1_3 = arith.constant 1 : index
+      cf.br ^bb5(%c0_2 : index)
+    ^bb5(%4: index):  // 2 preds: ^bb4, ^bb6
+      %5 = arith.cmpi slt, %4, %c16 : index
+      cf.cond_br %5, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      %6 = memref.load %in1_cons_buff_0[%4] : memref<16xi32>
+      %7 = memref.load %in2_cons_buff_0[%4] : memref<16xi32>
+      %8 = arith.muli %6, %7 : i32
+      memref.store %8, %out_buff_0[%4] : memref<16xi32>
+      %9 = arith.addi %4, %c1_3 : index
+      cf.br ^bb5(%9 : index)
+    ^bb7:  // pred: ^bb5
+      aie.use_lock(%in1_cons_prod_lock, Release, 1)
+      aie.use_lock(%in2_cons_prod_lock, Release, 1)
+      aie.use_lock(%out_cons_lock, Release, 1)
+      aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1)
+      aie.use_lock(%in2_cons_cons_lock, AcquireGreaterEqual, 1)
+      aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1)
+      %c0_4 = arith.constant 0 : index
+      %c16_5 = arith.constant 16 : index
+      %c1_6 = arith.constant 1 : index
+      cf.br ^bb8(%c0_4 : index)
+    ^bb8(%10: index):  // 2 preds: ^bb7, ^bb9
+      %11 = arith.cmpi slt, %10, %c16_5 : index
+      cf.cond_br %11, ^bb9, ^bb10
+    ^bb9:  // pred: ^bb8
+      %12 = memref.load %in1_cons_buff_1[%10] : memref<16xi32>
+      %13 = memref.load %in2_cons_buff_1[%10] : memref<16xi32>
+      %14 = arith.muli %12, %13 : i32
+      memref.store %14, %out_buff_1[%10] : memref<16xi32>
+      %15 = arith.addi %10, %c1_6 : index
+      cf.br ^bb8(%15 : index)
+    ^bb10:  // pred: ^bb8
+      aie.use_lock(%in1_cons_prod_lock, Release, 1)
+      aie.use_lock(%in2_cons_prod_lock, Release, 1)
+      aie.use_lock(%out_cons_lock, Release, 1)
+      %16 = arith.addi %2, %c2 : index
+      cf.br ^bb3(%16 : index)
+    ^bb11:  // pred: ^bb3
+      %17 = arith.addi %0, %c1 : index
+      cf.br ^bb1(%17 : index)
+    ^bb12:  // pred: ^bb1
+      aie.end
+    }
+    aie.shim_dma_allocation @in1(MM2S, 0, 0)
+    aiex.runtime_sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) {  // queues one 4096-element transfer each for @out (%arg2), @in1 (%arg0) and @in2 (%arg1), then a single npu.sync
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1], packet = ) {id = 2 : i64, metadata = @out} : memref<4096xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1], packet = ) {id = 0 : i64, metadata = @in1} : memref<4096xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1], packet = ) {id = 1 : i64, metadata = @in2} : memref<4096xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+    }
+    aie.shim_dma_allocation @in2(MM2S, 1, 0)
+    aie.shim_dma_allocation @out(S2MM, 0, 0)
+    %mem_0_2 = aie.mem(%tile_0_2) {  // three DMA channels, each alternating between two buffer descriptors: S2MM 0 -> in1 buffers, S2MM 1 -> in2 buffers, MM2S 0 <- out buffers
+      %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb2
+      aie.use_lock(%in1_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%in1_cons_buff_0 : memref<16xi32>, 0, 16) {bd_id = 0 : i32, next_bd_id = 1 : i32}
+      aie.use_lock(%in1_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb2
+    ^bb2:  // pred: ^bb1
+      aie.use_lock(%in1_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%in1_cons_buff_1 : memref<16xi32>, 0, 16) {bd_id = 1 : i32, next_bd_id = 0 : i32}
+      aie.use_lock(%in1_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb1
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb6)
+    ^bb4:  // 2 preds: ^bb3, ^bb5
+      aie.use_lock(%in2_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%in2_cons_buff_0 : memref<16xi32>, 0, 16) {bd_id = 2 : i32, next_bd_id = 3 : i32}
+      aie.use_lock(%in2_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb5
+    ^bb5:  // pred: ^bb4
+      aie.use_lock(%in2_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%in2_cons_buff_1 : memref<16xi32>, 0, 16) {bd_id = 3 : i32, next_bd_id = 2 : i32}
+      aie.use_lock(%in2_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb4
+    ^bb6:  // pred: ^bb3
+      %2 = aie.dma_start(MM2S, 0, ^bb7, ^bb9)
+    ^bb7:  // 2 preds: ^bb6, ^bb8
+      aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%out_buff_0 : memref<16xi32>, 0, 16) {bd_id = 4 : i32, next_bd_id = 5 : i32}
+      aie.use_lock(%out_prod_lock, Release, 1)
+      aie.next_bd ^bb8
+    ^bb8:  // pred: ^bb7
+      aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%out_buff_1 : memref<16xi32>, 0, 16) {bd_id = 5 : i32, next_bd_id = 4 : i32}
+      aie.use_lock(%out_prod_lock, Release, 1)
+      aie.next_bd ^bb7
+    ^bb9:  // pred: ^bb6
+      aie.end
+    }
+    aie.packet_flow(15) {
+      aie.packet_source<%tile_0_0, Ctrl : 0>
+      aie.packet_dest<%tile_0_0, South : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    aie.packet_flow(15) {
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_0, Ctrl : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    aie.packet_flow(27) {
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_2, Ctrl : 0>
+    } {keep_pkt_header = true, priority_route = true}
+  }
+}
diff --git a/programming_examples/basic/vvmul-merged-ctrl/run.lit b/programming_examples/basic/vvmul-merged-ctrl/run.lit
new file mode 100644
index 0000000000..bd0094f177
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-ctrl/run.lit
@@ -0,0 +1,14 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --basic-alloc-scheme --generate-ctrl-pkt-overlay --xclbin-name=aie1.xclbin --npu-insts-name=insts1.txt %S/aie1.mlir
+// RUN: %python aiecc.py --no-aiesim --aie-generate-ctrlpkt --aie-generate-npu --no-compile-host --basic-alloc-scheme --generate-ctrl-pkt-overlay --npu-insts-name=insts2.txt %S/aie2.mlir
+// RUN: aie-translate -aie-ctrlpkt-to-bin aie2.mlir.prj/ctrlpkt.mlir -o ctrlpkt.txt
+// RUN: aie-opt -aie-ctrl-packet-infer-tiles -aie-generate-column-control-overlay="route-shim-to-tile-ctrl=true" -aie-ctrl-packet-to-dma aie2.mlir.prj/ctrlpkt.mlir > aie3.mlir
+// RUN: %python aiecc.py --no-aiesim --aie-only-generate-npu --no-compile-host --generate-ctrl-pkt-overlay --xclbin-name=aie3.xclbin --npu-insts-name=insts3.txt aie3.mlir
+
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe | FileCheck %s
+// CHECK: PASS!
diff --git a/programming_examples/basic/vvmul-merged-ctrl/run.sh b/programming_examples/basic/vvmul-merged-ctrl/run.sh new file mode 100644 index 0000000000..1b4864bee2 --- /dev/null +++ b/programming_examples/basic/vvmul-merged-ctrl/run.sh @@ -0,0 +1,7 @@ +python /scratch/aba/micro/mlir-aie/install/bin/aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --basic-alloc-scheme --generate-ctrl-pkt-overlay --xclbin-name=aie1.xclbin --npu-insts-name=insts1.txt ./aie1.mlir +python /scratch/aba/micro/mlir-aie/install/bin/aiecc.py --no-aiesim --aie-generate-ctrlpkt --aie-generate-npu --no-compile-host --basic-alloc-scheme --generate-ctrl-pkt-overlay --npu-insts-name=insts2.txt ./aie2.mlir +aie-translate -aie-ctrlpkt-to-bin aie2.mlir.prj/ctrlpkt.mlir -o ctrlpkt.txt +aie-opt -aie-ctrl-packet-infer-tiles -aie-generate-column-control-overlay="route-shim-to-tile-ctrl=true" -aie-ctrl-packet-to-dma aie2.mlir.prj/ctrlpkt.mlir > aie3.mlir +python /scratch/aba/micro/mlir-aie/install/bin/aiecc.py --no-aiesim --aie-only-generate-npu --no-compile-host --generate-ctrl-pkt-overlay --xclbin-name=aie3.xclbin --npu-insts-name=insts3.txt aie3.mlir +clang ./test.cpp -o test.exe -std=c++11 -Wall -I/opt/xilinx/xrt/include -L/opt/xilinx/xrt/lib -luuid -lxrt_coreutil -lrt -lstdc++ -lboost_program_options -lboost_filesystem +./test.exe \ No newline at end of file diff --git a/programming_examples/basic/vvmul-merged-ctrl/test.cpp b/programming_examples/basic/vvmul-merged-ctrl/test.cpp new file mode 100644 index 0000000000..d6e47ab306 --- /dev/null +++ b/programming_examples/basic/vvmul-merged-ctrl/test.cpp @@ -0,0 +1,183 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include "experimental/xrt_kernel.h" +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +constexpr int IN_SIZE = 4096; +constexpr int OUT_SIZE = IN_SIZE; +constexpr int CTRL_IN_SIZE = 1024; + +#define IN_DATATYPE std::uint32_t +#define OUT_DATATYPE std::uint32_t + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +int main(int argc, const char *argv[]) { + // AIE design's data streams + std::vector instr2_v = load_instr_sequence("insts2.txt"); + // AIE configuration as control packet streams + std::vector instr3_cfg_v = load_instr_sequence("insts3.txt"); + // AIE configuration control packets' raw data + std::vector ctrlPackets = load_instr_sequence("ctrlpkt.txt"); + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + // Skeleton xclbin containing only the control packet network + auto xclbin = xrt::xclbin("aie1.xclbin"); + + // std::string Node = "MLIR_AIE"; + + // // Get the kernel from the xclbin + // auto xkernels = xclbin.get_kernels(); + // auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + // [Node](xrt::xclbin::kernel &k) { + // auto name = k.get_name(); + // std::cout << "Name: " << name << std::endl; + // return name.rfind(Node, 0) == 0; + // }); + // auto kernelName = xkernel.get_name(); + auto kernelName = "MLIR_AIE"; + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, 
xclbin.get_uuid()); + + // get a kernel handle + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr3 = xrt::bo(device, instr3_cfg_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_instr2 = xrt::bo(device, instr2_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(IN_DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(IN_DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(OUT_DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + auto bo_ctrlpkt = xrt::bo(device, CTRL_IN_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + + IN_DATATYPE *bufInA = bo_inA.map(); + IN_DATATYPE *bufInB = bo_inB.map(); + std::vector srcVecA, srcVecB; + for (int i = 0; i < IN_SIZE; i++) { + srcVecA.push_back(1); + srcVecB.push_back(1); + } + memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(IN_DATATYPE))); + memcpy(bufInB, srcVecB.data(), (srcVecB.size() * sizeof(IN_DATATYPE))); + + void *bufInstr2 = bo_instr2.map(); + memcpy(bufInstr2, instr2_v.data(), instr2_v.size() * sizeof(int)); + + void *bufInstr3 = bo_instr3.map(); + memcpy(bufInstr3, instr3_cfg_v.data(), instr3_cfg_v.size() * sizeof(int)); + + void *bufctrlpkt = bo_ctrlpkt.map(); + memcpy(bufctrlpkt, ctrlPackets.data(), ctrlPackets.size() * sizeof(int)); + + // Synchronizing BOs + bo_instr3.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_instr2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_ctrlpkt.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + + // Creating a runlist to contain two seperate runs + xrt::runlist runlist = xrt::runlist(context); + + // Run 0: configuration + auto run0 = xrt::run(kernel); + run0.set_arg(0, opcode); + run0.set_arg(1, bo_instr3); + run0.set_arg(2, instr3_cfg_v.size()); + 
run0.set_arg(3, bo_ctrlpkt); + run0.set_arg(4, 0); + run0.set_arg(5, 0); + run0.set_arg(6, 0); + run0.set_arg(7, 0); + // Run 1: the design + auto run1 = xrt::run(kernel); + run1.set_arg(0, opcode); + run1.set_arg(1, bo_instr2); + run1.set_arg(2, instr2_v.size()); + run1.set_arg(3, bo_inA); + run1.set_arg(4, bo_inB); + run1.set_arg(5, bo_out); + run1.set_arg(6, 0); + run1.set_arg(7, 0); + + // Executing and waiting on the runlist + runlist.add(run0); + runlist.add(run1); + runlist.execute(); + runlist.wait(); + + // bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + // OUT_DATATYPE *bufOut = bo_out.map(); + + // int errors = 0; + + // for (uint32_t i = 0; i < 64; i++) { + // for (uint32_t j = 0; j < 64; j++) { + // uint32_t ref = 1 + 12; + // if (*(bufOut + i * 64 + j) != ref) { + // std::cout << "Error in output " << std::to_string(bufOut[i * 64 + j]) + // << " != " << ref << std::endl; + // errors++; + // } + // // else + // // std::cout << "Correct output " << std::to_string(bufOut[i * 64 + j]) + // // << " == " << ref << std::endl; + // } + // } + + // if (!errors) { + // std::cout << "\nPASS!\n\n"; + // return 0; + // } + + // std::cout << "\nfailed.\n\n"; + // return 1; +} diff --git a/programming_examples/basic/vvmul-merged-ctrl/test.exe b/programming_examples/basic/vvmul-merged-ctrl/test.exe new file mode 100755 index 0000000000..f75430a527 Binary files /dev/null and b/programming_examples/basic/vvmul-merged-ctrl/test.exe differ diff --git a/programming_examples/basic/vvmul-merged-xclbin/CMakeLists.txt b/programming_examples/basic/vvmul-merged-xclbin/CMakeLists.txt new file mode 100644 index 0000000000..68cfdbd97f --- /dev/null +++ b/programming_examples/basic/vvmul-merged-xclbin/CMakeLists.txt @@ -0,0 +1,77 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Advanced Micro Devices, Inc. 
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# 3.20 is the first release that accepts CMAKE_CXX_STANDARD 23 (target_link_directories below needs 3.13)
+cmake_minimum_required(VERSION 3.20)
+
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC
+    DATASIZE=${DATASIZE}
+    DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
diff 
--git a/programming_examples/basic/vvmul-merged-xclbin/Makefile b/programming_examples/basic/vvmul-merged-xclbin/Makefile
new file mode 100755
index 0000000000..d6f5e4174c
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-xclbin/Makefile
@@ -0,0 +1,86 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+#
+##===----------------------------------------------------------------------===##
+
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+include ${srcdir}/../../makefile-common
+
+data_size = 8192
+targetname = vectorMult
+devicename = npu
+col = 0
+
+VPATH := ${srcdir}/../../../aie_kernels/aie2
+
+build2/%.o: %.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
+
+
+all: build/final.xclbin
+
+build/aie.mlir: ${srcdir}/aie2.py
+	mkdir -p ${@D}
+	python3 $< ${devicename} ${col} ${data_size} > $@
+
+build/final.xclbin: build/aie.mlir build2/final.xclbin
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --xclbin-kernel-name=VVM --xclbin-kernel-id=0x902 \
+		--xclbin-instance-name=VVMINST --no-aiesim --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+		--xclbin-input=../build2/final.xclbin \
+		--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
+
+# The second design comes from the sibling vector_scalar_mul example. It is located
+# through ${srcdir} so that `make -f <path>/Makefile` also works from another directory.
+build2/aie.mlir: ${srcdir}/../vector_scalar_mul/aie2.py
+	mkdir -p ${@D}
+	python3 $< ${data_size} 0 > $@
+
+build2/final.xclbin: build2/aie.mlir build2/scale.o
+	cd build2/ && aiecc.py --xclbin-kernel-name=VSM --xclbin-kernel-id=0x901 \
+		--xclbin-instance-name=VSMINST --no-aiesim --aie-generate-cdo --no-compile-host --xclbin-name=final.xclbin \
+		--aie-generate-npu --npu-insts-name=insts.txt aie.mlir
+
+
+${targetname}.exe: ${srcdir}/test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname} -DDATASIZE=${data_size}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@
+endif
+
+
+
+# Changing variables when we target VCK5000
+vck5000: devicename=xcvc1902
+vck5000: col=6
+
+vck5000: build/aie.mlir
+	aiecc.py --xchesscc --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \
+		-I/opt/xaiengine/include \
+		-I${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/include \
+		-L/opt/xaiengine/lib \
+		-L/lib/x86_64-linux-gnu/ \
+		${srcdir}/test_vck5000.cpp \
+		${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
+		-Wl,-R/opt/xaiengine/lib \
+		-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
+
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt
+	${powershell} ./$< --xclbin build/final.xclbin \
+		--instr build/insts.txt --instr_2 build2/insts.txt -k VVM
+
+clean:
+	rm -rf build build2 _build inst aie.mlir.prj core_* test.elf ${targetname}.exe build2/final.xclbin ../vector_scalar_mul/aie.mlir
diff --git a/programming_examples/basic/vvmul-merged-xclbin/README.md b/programming_examples/basic/vvmul-merged-xclbin/README.md
new file mode 100644
index 0000000000..56b0a6cde0
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-xclbin/README.md
@@ -0,0 +1,57 @@
+
+
+# Vector Vector Multiply
+
+A simple binary operator, which uses a single AIE core to multiply two vectors together. The overall vector size is set by `data_size` in the Makefile (`8192` by default) and it is processed by the core in smaller sub tiles of size `16`. It shows how simple it can be to just feed data into the AIEs using the ObjectFIFO abstraction, and drain the results back to external memory. This reference design can be run on either a Ryzen™ AI NPU or a VCK5000.
+
+The kernel executes on AIE tile (`col`, 2). 
Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targeting NPU or VCK5000. The AIE tile performs the multiplication operations and the Shim tile brings the data back out to external memory.
+
+## Source Files Overview
+
+1. `aie2.py`: defines the AIE array structural design using IRON AIE language bindings. This generates mlir-aie that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and `insts.txt` for the NPU in Ryzen™ AI).
+
+1. `test.cpp`: This C++ code is a testbench for the design example targeting Ryzen™ AI (AIE-ML). The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the program verifies the results.
+
+1. `test_vck5000.cpp`: This C++ code is a testbench for the design example targeting the VCK5000 PCIe card (AIE). The code is responsible for configuring the AIEs, allocating memory, providing input data, and executing the AIE design on the VCK5000. After executing, the program verifies the results.
+
+## Ryzen™ AI Usage
+
+### C++ Testbench
+
+To compile the design and C++ testbench:
+
+```
+make
+make vectorMult.exe
+```
+
+To run the design:
+
+```
+make run
+```
+
+## VCK5000 Usage
+
+### C++ Testbench
+
+To compile the design and C++ testbench:
+
+```
+make vck5000
+```
+
+To run the design:
+
+```
+./test.elf
+```
+
diff --git a/programming_examples/basic/vvmul-merged-xclbin/aie2.py b/programming_examples/basic/vvmul-merged-xclbin/aie2.py
new file mode 100644
index 0000000000..2400c6c36a
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-xclbin/aie2.py
@@ -0,0 +1,94 @@
+# vector_vector_mul/aie2.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext import memref, arith + +import sys + + +def my_vector_mul(vector_size): + N = vector_size + n = 16 + N_div_n = N // n + + buffer_depth = 2 + + if len(sys.argv) != 4: + raise ValueError("[ERROR] Need 3 command line arguments (Device name, Col, size)") + + if sys.argv[1] == "npu": + dev = AIEDevice.npu1_1col + elif sys.argv[1] == "xcvc1902": + dev = AIEDevice.xcvc1902 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + + @device(dev) + def device_body(): + memRef_ty = T.memref(n, T.i32()) + + # AIE Core Function declarations + + # Tile declarations + ShimTile = tile(int(sys.argv[2]), 0) + ComputeTile2 = tile(int(sys.argv[2]), 2) + + # AIE-array data movement with object fifos + of_in1 = object_fifo("in1", ShimTile, ComputeTile2, buffer_depth, memRef_ty) + of_in2 = object_fifo("in2", ShimTile, ComputeTile2, buffer_depth, memRef_ty) + of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2) + def core_body(): + # Effective while(1) + for _ in for_(sys.maxsize): + # Number of sub-vector "tile" iterations + for _ in for_(N_div_n): + elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1) + elem_in2 = of_in2.acquire(ObjectFifoPort.Consume, 1) + elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) + for i in for_(n): + v0 = memref.load(elem_in1, [i]) + v1 = memref.load(elem_in2, [i]) + v2 = arith.muli(v0, v1) + memref.store(v2, elem_out, [i]) + yield_([]) + of_in1.release(ObjectFifoPort.Consume, 1) + of_in2.release(ObjectFifoPort.Consume, 1) + of_out.release(ObjectFifoPort.Produce, 1) + yield_([]) + yield_([]) + + # To/from AIE-array data 
movement + tensor_ty = T.memref(N, T.i32()) + + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) + def sequence(A, B, C): + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) + + +with mlir_mod_ctx() as ctx: + vector_size = int(sys.argv[3]) + my_vector_mul(vector_size) + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) diff --git a/programming_examples/basic/vvmul-merged-xclbin/run_makefile.lit b/programming_examples/basic/vvmul-merged-xclbin/run_makefile.lit new file mode 100644 index 0000000000..6875524001 --- /dev/null +++ b/programming_examples/basic/vvmul-merged-xclbin/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile +// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s +// CHECK: PASS! diff --git a/programming_examples/basic/vvmul-merged-xclbin/run_vck5000.lit b/programming_examples/basic/vvmul-merged-xclbin/run_vck5000.lit new file mode 100644 index 0000000000..dcaa9f99c5 --- /dev/null +++ b/programming_examples/basic/vvmul-merged-xclbin/run_vck5000.lit @@ -0,0 +1,8 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: hsa, chess +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile vck5000 +// RUN: %run_on_vck5000 ./test.elf diff --git a/programming_examples/basic/vvmul-merged-xclbin/sweep.sh b/programming_examples/basic/vvmul-merged-xclbin/sweep.sh new file mode 100644 index 0000000000..579a545205 --- /dev/null +++ b/programming_examples/basic/vvmul-merged-xclbin/sweep.sh @@ -0,0 +1,9 @@ +M_lo=4096 +M_step=4096 +M_hi=12288 + +Ms=$(seq $M_lo $M_step $M_hi) + +for M in $Ms; do + make clean && make run data_size=$M +done \ No newline at end of file diff --git a/programming_examples/basic/vvmul-merged-xclbin/test.cpp b/programming_examples/basic/vvmul-merged-xclbin/test.cpp new file mode 100644 index 0000000000..40c2d64f8e --- /dev/null +++ b/programming_examples/basic/vvmul-merged-xclbin/test.cpp @@ -0,0 +1,190 @@ + +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "test_utils.h" + +namespace po = boost::program_options; +using TY_2 = std::uint32_t; +const int scaleFactor = 3; + +int main(int argc, const char *argv[]) { + + // ------------------------------------------------------ + // Parse program arguments + // ------------------------------------------------------ + po::options_description desc("Allowed options"); + po::variables_map vm; + desc.add_options()("help,h", "produce help message")( + "xclbin", po::value()->required(), + "the input xclbin path2")( + "kernel,k", po::value()->required(), + "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( + "verbosity,v", po::value()->default_value(0), + "the verbosity of the output")( + "instr", po::value()->required(), + "path of file containing userspace instructions sent to the NPU")( + "instr_2", po::value()->required(), + "path of file containing userspace instructions sent to the NPU")( + "verify", 
po::value()->default_value(true), + "whether to verify the AIE computed output")( + "iters", po::value()->default_value(1))( + "warmup", po::value()->default_value(0))( + "trace_sz,t", po::value()->default_value(0))( + "trace_file", po::value()->default_value("trace.txt"), + "where to store trace output"); + + test_utils::parse_options(argc, argv, desc, vm); + int verbosity = vm["verbosity"].as(); + int do_verify = vm["verify"].as(); + int n_iterations = vm["iters"].as(); + int n_warmup_iterations = vm["warmup"].as(); + int trace_size = vm["trace_sz"].as(); + + constexpr int IN_SIZE_1 = DATASIZE; + constexpr int IN_VOLUME_2 = DATASIZE; + + constexpr int OUT_SIZE_1 = DATASIZE; + constexpr int OUT_VOLUME_2 = DATASIZE; + int IN_SIZE_2 = IN_VOLUME_2 * sizeof(TY_2); + int OUT_SIZE_2 = OUT_VOLUME_2 * sizeof(TY_2) + trace_size; + + unsigned int device_index = 0; + auto device = xrt::device(device_index); + std::vector instr_v1 = test_utils::load_instr_sequence(vm["instr"].as()); + auto xclbin = xrt::xclbin(vm["xclbin"].as()); // Load the xclbin + device.register_xclbin(xclbin); // Register xclbin + xrt::hw_context context_1(device, xclbin.get_uuid()); // Get a hardware context + auto kernel_1 = xrt::kernel(context_1, "VVM"); + + std::vector instr_v2 = test_utils::load_instr_sequence(vm["instr_2"].as()); + auto kernel_2 = xrt::kernel(context_1, "VSM"); // Get a kernel_2 handle: MLIR_AIE + + + // ------------------------------------------------------ + // Initialize input/ output buffer sizes and sync them + // ------------------------------------------------------ + + auto bo_instr_1 = xrt::bo(device, instr_v1.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel_1.group_id(1)); + void *bufInstr_1 = bo_instr_1.map(); + memcpy(bufInstr_1, instr_v1.data(), instr_v1.size() * sizeof(int)); + + auto bo_inA_1 = xrt::bo(device, IN_SIZE_1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_1.group_id(3)); + uint32_t *bufInA_1 = bo_inA_1.map(); + std::vector srcVecA_1; + for (int i = 
0; i < IN_SIZE_1; i++) + srcVecA_1.push_back(i + 1); + memcpy(bufInA_1, srcVecA_1.data(), (srcVecA_1.size() * sizeof(uint32_t))); + + auto bo_inB_1 = xrt::bo(device, IN_SIZE_1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_1.group_id(4)); + uint32_t *bufInB_1 = bo_inB_1.map(); + std::vector srcVecB_1; + for (int i = 0; i < IN_SIZE_1; i++) + srcVecB_1.push_back(i); + memcpy(bufInB_1, srcVecB_1.data(), (srcVecB_1.size() * sizeof(uint32_t))); + + auto bo_out_1 = xrt::bo(device, OUT_SIZE_1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_1.group_id(5)); + + + auto bo_instr_2 = xrt::bo(device, instr_v2.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel_2.group_id(1)); + void *bufInstr_2 = bo_instr_2.map(); + memcpy(bufInstr_2, instr_v2.data(), instr_v2.size() * sizeof(int)); + + auto bo_inA_2 = xrt::bo(device, IN_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel_2.group_id(3)); + TY_2 *bufInA_2 = bo_inA_2.map(); + for (int i = 0; i < IN_VOLUME_2; i++) + bufInA_2[i] = i + 1; + + auto bo_inFactor = xrt::bo(device, 1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_2.group_id(4)); + int32_t *bufInFactor = bo_inFactor.map(); + *bufInFactor = (TY_2)scaleFactor; + + auto bo_outC_2 = xrt::bo(device, OUT_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel_2.group_id(5)); + + + bo_instr_1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA_1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inB_1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + bo_instr_2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA_2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inFactor.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // RUN KERNEL + + #define SIZE 4096 + + unsigned int opcode = 3; + + std::ofstream f_time; + std::string file_name = "time_" + std::to_string(DATASIZE) + ".txt"; + f_time.open(file_name); + for (int i=1; i<=1000; i++) { + auto start = std::chrono::high_resolution_clock::now(); + + auto run1 = kernel_1(opcode, bo_instr_1, instr_v1.size(), bo_inA_1, bo_inB_1, bo_out_1); + run1.wait(); + auto run2 = kernel_2(opcode, bo_instr_2, instr_v2.size(), bo_inA_2, 
bo_inFactor, bo_outC_2); + run2.wait(); + + auto stop = std::chrono::high_resolution_clock::now(); + float npu_time = std::chrono::duration_cast(stop - start).count(); + if (i<11) + std::cout << i << " " << IN_SIZE_1 << " NPU time: " << npu_time << "us." << std::endl; + f_time << npu_time << "\n"; + } + f_time.close(); + + bo_out_1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bo_outC_2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + uint32_t *bufOut_1 = bo_out_1.map(); + TY_2 *bufOut_2 = bo_outC_2.map(); + + + // COMPARE + int errors = 0; + + for (uint32_t i = 0; i < SIZE; i++) { + if (*(bufOut_1 + i) != *(bufInA_1 + i) * *(bufInB_1 + i)) { + std::cout << "Error in output " << *(bufOut_1 + i) + << " != " << *(bufInA_1 + i) << " * " << *(bufInB_1 + i) + << std::endl; + errors++; + } else { + if (verbosity > 1) + std::cout << "Correct output " << *(bufOut_1 + i) + << " == " << *(bufInA_1 + i) * *(bufInB_1 + i) << std::endl; + } + } + + for (uint32_t i = 0; i < SIZE; i++) { + int32_t ref = bufInA_2[i] * scaleFactor; + int32_t test = bufOut_2[i]; + if (test != ref) { + if (verbosity >= 1) + std::cout << "Error in output " << test << " != " << ref << std::endl; + errors++; + } else { + if (verbosity >= 1) + std::cout << "Correct output " << test << " == " << ref << std::endl; + } + } + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nfailed with errors:" << errors << ".\n\n"; + return 1; + } + return 0; +} diff --git a/programming_examples/basic/vvmul-merged-xclbin/test_vck5000.cpp b/programming_examples/basic/vvmul-merged-xclbin/test_vck5000.cpp new file mode 100644 index 0000000000..ab2096bef9 --- /dev/null +++ b/programming_examples/basic/vvmul-merged-xclbin/test_vck5000.cpp @@ -0,0 +1,138 @@ +//===- test_vck5000.cpp -----------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memory_allocator.h" +#include "test_library.h" + +#include "aie_data_movement.cpp" +#include "aie_inc.cpp" + +#include "hsa/hsa.h" +#include "hsa/hsa_ext_amd.h" + +constexpr int DMA_COUNT = 256; + +void hsa_check_status(const std::string func_name, hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char *status_string(new char[1024]); + hsa_status_string(status, &status_string); + std::cout << func_name << " failed: " << status_string << std::endl; + delete[] status_string; + } else { + std::cout << func_name << " success" << std::endl; + } +} + +int main(int argc, char *argv[]) { + uint64_t row = 0; + uint64_t col = 6; + + std::vector queues; + uint32_t aie_max_queue_size(0); + + aie_libxaie_ctx_t *xaie = mlir_aie_init_libxaie(); + + // This is going to initialize HSA, create a queue + // and get an agent + int ret = mlir_aie_init_device(xaie); + + if (ret) { + std::cout << "[ERROR] Error when calling mlir_aie_init_device)" + << std::endl; + return -1; + } + + // Getting access to all of the HSA agents + std::vector agents = xaie->agents; + + if (agents.empty()) { + std::cout << "No agents found. Exiting." 
<< std::endl; + return -1; + } + + std::cout << "Found " << agents.size() << " agents" << std::endl; + + hsa_queue_t *q = xaie->cmd_queue; + + // Adding to our vector of queues + queues.push_back(q); + assert(queues.size() > 0 && "No queues were sucesfully created!"); + + mlir_aie_configure_cores(xaie); + mlir_aie_configure_switchboxes(xaie); + mlir_aie_initialize_locks(xaie); + mlir_aie_configure_dmas(xaie); + mlir_aie_start_cores(xaie); + + // Allocating some device memory + ext_mem_model_t buf0, buf1, buf2; + uint32_t *in_a = (uint32_t *)mlir_aie_mem_alloc(xaie, buf0, DMA_COUNT); + uint32_t *in_b = (uint32_t *)mlir_aie_mem_alloc(xaie, buf1, DMA_COUNT); + uint32_t *out = (uint32_t *)mlir_aie_mem_alloc(xaie, buf2, DMA_COUNT); + mlir_aie_sync_mem_dev(buf0); + mlir_aie_sync_mem_dev(buf1); + mlir_aie_sync_mem_dev(buf2); + + if (in_a == nullptr || in_b == nullptr || out == nullptr) { + std::cout << "Could not allocate in device memory" << std::endl; + return -1; + } + + for (int i = 0; i < DMA_COUNT; i++) { + in_a[i] = i + 1; + in_b[i] = i; + out[i] = 0xdeface; + } + + // Pass arguments in the order of dma_memcpys in the mlir + invoke_data_movement(queues[0], &agents[0], out, in_a, in_b); + + int errors = 0; + + for (int i = 0; i < DMA_COUNT; i++) { + uint32_t s0 = in_a[i]; + uint32_t s1 = in_b[i]; + uint32_t d = out[i]; + printf("s0[%d] = 0x%x\n", i, s0); + printf("s1[%d] = 0x%x\n", i, s1); + printf("d[%d] = 0x%x\n", i, d); + if (d != (s0 * s1)) { + errors++; + printf("mismatch 0x%x != 0x%x + 0x%x\n", d, s0, s1); + } + } + + // destroying the queue + hsa_queue_destroy(queues[0]); + + // Shutdown AIR and HSA + mlir_aie_deinit_libxaie(xaie); + + if (!errors) { + printf("PASS!\n"); + return 0; + } else { + printf("fail %d/%d.\n", errors, DMA_COUNT); + return -1; + } +} diff --git a/programming_examples/basic/vvmul/CMakeLists.txt b/programming_examples/basic/vvmul/CMakeLists.txt new file mode 100644 index 0000000000..68cfdbd97f --- /dev/null +++ 
b/programming_examples/basic/vvmul/CMakeLists.txt @@ -0,0 +1,77 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Advanced Micro Devices, Inc. + +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif() + +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName ${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC + DATASIZE=${DATASIZE} + DISABLE_ABI_CHECK=1) + +target_include_directories (${currentTarget} PUBLIC + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} + 
${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() diff --git a/programming_examples/basic/vvmul/Makefile b/programming_examples/basic/vvmul/Makefile new file mode 100755 index 0000000000..b9d833cafb --- /dev/null +++ b/programming_examples/basic/vvmul/Makefile @@ -0,0 +1,79 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# +##===----------------------------------------------------------------------===## + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +include ${srcdir}/../../makefile-common + +VPATH := ${srcdir}/../../../aie_kernels/aie2 + +data_size = 4096 +targetname = vectorMult +devicename = npu +col = 0 + +all: build/final.xclbin + +build2/%.o: %.cc + mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F} + +build/aie.mlir: ${srcdir}/aie2.py + mkdir -p ${@D} + python3 $< ${devicename} ${col} ${data_size} > $@ + +build/final.xclbin: build/aie.mlir + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts.txt ${ $@ + +build2/final.xclbin: build2/aie.mlir build2/scale.o + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) + + +# Changing variables when we target VCK5000 +vck5000: devicename=xcvc1902 +vck5000: col=6 + +vck5000: 
build/aie.mlir + aiecc.py --xchesscc --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \ + -I/opt/xaiengine/include \ + -I${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/include \ + -L/opt/xaiengine/lib \ + -L/lib/x86_64-linux-gnu/ \ + ${srcdir}/test_vck5000.cpp \ + ${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \ + -Wl,-R/opt/xaiengine/lib \ + -Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf + + +run: ${targetname}.exe build/final.xclbin build/insts.txt build2/final.xclbin + ${powershell} ./$< --xclbin build/final.xclbin --xclbin_2 build2/final.xclbin --instr build/insts.txt --instr_2 build2/insts.txt -k MLIR_AIE + +clean: + rm -rf build build2 _build inst aie.mlir.prj core_* test.elf ${targetname}.exe build2/aie.mlir build2/final.xclbin diff --git a/programming_examples/basic/vvmul/README.md b/programming_examples/basic/vvmul/README.md new file mode 100644 index 0000000000..56b0a6cde0 --- /dev/null +++ b/programming_examples/basic/vvmul/README.md @@ -0,0 +1,57 @@ + + +# Vector Vector Multiply + +A simple binary operator, which uses a single AIE core to multiply two vectors together. The overall vector size in this design is `256` and it is processed by the core in smaller sub tiles of size `16`. It shows how simple it can be to just feed data into the AIEs using the ObjectFIFO abstraction, and drain the results back to external memory. This reference design can be run on either a Ryzen™ AI NPU or a VCK5000. + +The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targeting NPU or VCK5000. The AIE tile performs the multiplication operations and the Shim tile brings the data back out to external memory. + +## Source Files Overview + +1. `aie2.py`: defines the AIE array structural design using IRON AIE language bindings. 
This generates mlir-aie that is then compiled using `aiecc.py` to produce design binaries (i.e., XCLBIN and insts.txt for the NPU in Ryzen™ AI). + +1. `test.cpp`: This C++ code is a testbench for the design example targeting Ryzen™ AI (AIE-ML). The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the program verifies the results. + +1. `test_vck5000.cpp`: This C++ code is a testbench for the design example targeting the VCK5000 PCIe card (AIE). The code is responsible for configuring the AIEs, allocating memory, providing input data, and executing the AIE design on the VCK5000. After executing, the program verifies the results. + +## Ryzen™ AI Usage + +### C++ Testbench + +To compile the design and C++ testbench: + +``` +make +make vectorMult.exe +``` + +To run the design: + +``` +make run +``` + +## VCK5000 Usage + +### C++ Testbench + +To compile the design and C++ testbench: + +``` +make vck5000 +``` + +To run the design: + +``` +./test.elf +``` + diff --git a/programming_examples/basic/vvmul/aie2.py b/programming_examples/basic/vvmul/aie2.py new file mode 100644 index 0000000000..7cd85589ef --- /dev/null +++ b/programming_examples/basic/vvmul/aie2.py @@ -0,0 +1,89 @@ +# vector_vector_mul/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext import memref, arith + +import sys + + +def my_vector_mul(N): + n = 16 + N_div_n = N // n + + buffer_depth = 2 + + if sys.argv[1] == "npu": + dev = AIEDevice.npu1_1col + elif sys.argv[1] == "xcvc1902": + dev = AIEDevice.xcvc1902 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + + @device(dev) + def device_body(): + memRef_ty = T.memref(n, T.i32()) + + # AIE Core Function declarations + + # Tile declarations + ShimTile = tile(int(sys.argv[2]), 0) + ComputeTile2 = tile(int(sys.argv[2]), 2) + + # AIE-array data movement with object fifos + of_in1 = object_fifo("in1", ShimTile, ComputeTile2, buffer_depth, memRef_ty) + of_in2 = object_fifo("in2", ShimTile, ComputeTile2, buffer_depth, memRef_ty) + of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2) + def core_body(): + # Effective while(1) + for _ in for_(sys.maxsize): + # Number of sub-vector "tile" iterations + for _ in for_(N_div_n): + elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1) + elem_in2 = of_in2.acquire(ObjectFifoPort.Consume, 1) + elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) + for i in for_(n): + v0 = memref.load(elem_in1, [i]) + v1 = memref.load(elem_in2, [i]) + v2 = arith.muli(v0, v1) + memref.store(v2, elem_out, [i]) + yield_([]) + of_in1.release(ObjectFifoPort.Consume, 1) + of_in2.release(ObjectFifoPort.Consume, 1) + of_out.release(ObjectFifoPort.Produce, 1) + yield_([]) + yield_([]) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) + def sequence(A, B, C): + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, 
sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) + + +with mlir_mod_ctx() as ctx: + my_vector_mul(int(sys.argv[3])) + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) diff --git a/programming_examples/basic/vvmul/run_makefile.lit b/programming_examples/basic/vvmul/run_makefile.lit new file mode 100644 index 0000000000..6875524001 --- /dev/null +++ b/programming_examples/basic/vvmul/run_makefile.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile +// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s +// CHECK: PASS! diff --git a/programming_examples/basic/vvmul/run_vck5000.lit b/programming_examples/basic/vvmul/run_vck5000.lit new file mode 100644 index 0000000000..dcaa9f99c5 --- /dev/null +++ b/programming_examples/basic/vvmul/run_vck5000.lit @@ -0,0 +1,8 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: hsa, chess +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile vck5000 +// RUN: %run_on_vck5000 ./test.elf diff --git a/programming_examples/basic/vvmul/sweep.sh b/programming_examples/basic/vvmul/sweep.sh new file mode 100644 index 0000000000..579a545205 --- /dev/null +++ b/programming_examples/basic/vvmul/sweep.sh @@ -0,0 +1,9 @@ +M_lo=4096 +M_step=4096 +M_hi=12288 + +Ms=$(seq $M_lo $M_step $M_hi) + +for M in $Ms; do + make clean && make run data_size=$M +done \ No newline at end of file diff --git a/programming_examples/basic/vvmul/test.cpp b/programming_examples/basic/vvmul/test.cpp new file mode 100644 index 0000000000..25e8c53295 --- /dev/null +++ b/programming_examples/basic/vvmul/test.cpp @@ -0,0 +1,199 @@ + +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "test_utils.h" + +namespace po = boost::program_options; +using TY_2 = std::uint32_t; +const int scaleFactor = 3; + +int main(int argc, const char *argv[]) { + + // ------------------------------------------------------ + // Parse program arguments + // ------------------------------------------------------ + po::options_description desc("Allowed options"); + po::variables_map vm; + desc.add_options()("help,h", "produce help message")( + "xclbin", po::value()->required(), + "the input xclbin path2")( + "xclbin_2", po::value()->required(), + "the input xclbin path2")( + "kernel_2,k", po::value()->required(), + "the kernel_2 name in the XCLBIN (for instance PP_PRE_FD)")( + "verbosity,v", po::value()->default_value(0), + "the verbosity of the output")( + "instr", po::value()->required(), + "path of file containing userspace instructions sent to the NPU")( + "instr_2", po::value()->required(), + "path of file containing userspace instructions sent to the NPU")( + "verify", 
po::value()->default_value(true), + "whether to verify the AIE computed output")( + "iters", po::value()->default_value(1))( + "warmup", po::value()->default_value(0))( + "trace_sz,t", po::value()->default_value(0))( + "trace_file", po::value()->default_value("trace.txt"), + "where to store trace output"); + + test_utils::parse_options(argc, argv, desc, vm); + int verbosity = vm["verbosity"].as(); + int do_verify = vm["verify"].as(); + int n_iterations = vm["iters"].as(); + int n_warmup_iterations = vm["warmup"].as(); + int trace_size = vm["trace_sz"].as(); + + constexpr int IN_SIZE_1 = DATASIZE; + constexpr int OUT_SIZE_1 = IN_SIZE_1; + + constexpr int IN_VOLUME_2 = DATASIZE; + constexpr int OUT_VOLUME_2 = IN_VOLUME_2; + int IN_SIZE_2 = IN_VOLUME_2 * sizeof(TY_2); + int OUT_SIZE_2 = OUT_VOLUME_2 * sizeof(TY_2) + trace_size; + + // ------------------------------------------------------ + // Get device, load the xclbin & kernel_2 and register them + // ------------------------------------------------------ + // Get a device handle + unsigned int device_index = 0; + auto kernelName = "MLIR_AIE"; + auto device = xrt::device(device_index); + + std::vector instr_v1 = test_utils::load_instr_sequence(vm["instr"].as()); + auto xclbin_1 = xrt::xclbin(vm["xclbin"].as()); // Load the xclbin + device.register_xclbin(xclbin_1); // Register xclbin + xrt::hw_context context_1(device, xclbin_1.get_uuid()); // Get a hardware context + auto kernel_1 = xrt::kernel(context_1, kernelName); // Get a kernel_2 handle: MLIR_AIE + + std::vector instr_v2 = test_utils::load_instr_sequence(vm["instr_2"].as()); + auto xclbin_2 = xrt::xclbin(vm["xclbin_2"].as()); // Load the xclbin + device.register_xclbin(xclbin_2); // Register xclbin + xrt::hw_context context_2(device, xclbin_2.get_uuid()); // Get a hardware context + auto kernel_2 = xrt::kernel(context_2, kernelName); // Get a kernel_2 handle: MLIR_AIE + + + // ------------------------------------------------------ + // Initialize input/ 
output buffer sizes and sync them + // ------------------------------------------------------ + + auto bo_instr_1 = xrt::bo(device, instr_v1.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel_1.group_id(1)); + void *bufInstr_1 = bo_instr_1.map(); + memcpy(bufInstr_1, instr_v1.data(), instr_v1.size() * sizeof(int)); + + auto bo_inA_1 = xrt::bo(device, IN_SIZE_1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_1.group_id(3)); + uint32_t *bufInA_1 = bo_inA_1.map(); + std::vector srcVecA_1; + for (int i = 0; i < IN_SIZE_1; i++) + srcVecA_1.push_back(i + 1); + memcpy(bufInA_1, srcVecA_1.data(), (srcVecA_1.size() * sizeof(uint32_t))); + + auto bo_inB_1 = xrt::bo(device, IN_SIZE_1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_1.group_id(4)); + uint32_t *bufInB_1 = bo_inB_1.map(); + std::vector srcVecB_1; + for (int i = 0; i < IN_SIZE_1; i++) + srcVecB_1.push_back(i); + memcpy(bufInB_1, srcVecB_1.data(), (srcVecB_1.size() * sizeof(uint32_t))); + + auto bo_out_1 = xrt::bo(device, OUT_SIZE_1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_1.group_id(5)); + + + auto bo_instr_2 = xrt::bo(device, instr_v2.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel_2.group_id(1)); + void *bufInstr_2 = bo_instr_2.map(); + memcpy(bufInstr_2, instr_v2.data(), instr_v2.size() * sizeof(int)); + + auto bo_inA_2 = xrt::bo(device, IN_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel_2.group_id(3)); + TY_2 *bufInA_2 = bo_inA_2.map(); + for (int i = 0; i < IN_VOLUME_2; i++) + bufInA_2[i] = i + 1; + + auto bo_inFactor = xrt::bo(device, 1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_2.group_id(4)); + int32_t *bufInFactor = bo_inFactor.map(); + *bufInFactor = (TY_2)scaleFactor; + + auto bo_outC_2 = xrt::bo(device, OUT_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel_2.group_id(5)); + + + bo_instr_1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA_1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inB_1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + bo_instr_2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA_2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + 
bo_inFactor.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // RUN KERNEL + + unsigned int opcode = 3; + + std::ofstream f_time; + std::string file_name = "time_" + std::to_string(DATASIZE) + ".txt"; + f_time.open(file_name); + for (int i=1; i<=1000; i++) { + auto start = std::chrono::high_resolution_clock::now(); + + auto run1 = kernel_1(opcode, bo_instr_1, instr_v1.size(), bo_inA_1, bo_inB_1, bo_out_1); + run1.wait(); + auto run2 = kernel_2(opcode, bo_instr_2, instr_v2.size(), bo_inA_2, bo_inFactor, bo_outC_2); + run2.wait(); + + auto stop = std::chrono::high_resolution_clock::now(); + float npu_time = std::chrono::duration_cast(stop - start).count(); + if (i<11) + std::cout << i << " " << IN_SIZE_1 << " NPU time: " << npu_time << "us." << std::endl; + f_time << npu_time << "\n"; + } + f_time.close(); + + bo_out_1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + uint32_t *bufOut_1 = bo_out_1.map(); + + bo_outC_2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + TY_2 *bufOut_2 = bo_outC_2.map(); + + // COMPARE + + int errors = 0; + + for (uint32_t i = 0; i < OUT_SIZE_1; i++) { + if (*(bufOut_1 + i) != *(bufInA_1 + i) * *(bufInB_1 + i)) { + std::cout << "Error in output " << *(bufOut_1 + i) + << " != " << *(bufInA_1 + i) << " * " << *(bufInB_1 + i) + << std::endl; + errors++; + } else { + if (verbosity > 1) + std::cout << "Correct output " << *(bufOut_1 + i) + << " == " << *(bufInA_1 + i) * *(bufInB_1 + i) << std::endl; + } + } + + for (uint32_t i = 0; i < IN_VOLUME_2; i++) { + int32_t ref = bufInA_2[i] * scaleFactor; + int32_t test = bufOut_2[i]; + if (test != ref) { + if (verbosity >= 1) + std::cout << "Error in output " << test << " != " << ref << std::endl; + errors++; + } else { + if (verbosity >= 1) + std::cout << "Correct output " << test << " == " << ref << std::endl; + } + } + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nfailed.\n\n"; + return 1; + } +} diff --git a/programming_examples/basic/vvmul/test_vck5000.cpp 
b/programming_examples/basic/vvmul/test_vck5000.cpp new file mode 100644 index 0000000000..ab2096bef9 --- /dev/null +++ b/programming_examples/basic/vvmul/test_vck5000.cpp @@ -0,0 +1,138 @@ +//===- test_vck5000.cpp -----------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memory_allocator.h" +#include "test_library.h" + +#include "aie_data_movement.cpp" +#include "aie_inc.cpp" + +#include "hsa/hsa.h" +#include "hsa/hsa_ext_amd.h" + +constexpr int DMA_COUNT = 256; + +void hsa_check_status(const std::string func_name, hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char *status_string(new char[1024]); + hsa_status_string(status, &status_string); + std::cout << func_name << " failed: " << status_string << std::endl; + delete[] status_string; + } else { + std::cout << func_name << " success" << std::endl; + } +} + +int main(int argc, char *argv[]) { + uint64_t row = 0; + uint64_t col = 6; + + std::vector queues; + uint32_t aie_max_queue_size(0); + + aie_libxaie_ctx_t *xaie = mlir_aie_init_libxaie(); + + // This is going to initialize HSA, create a queue + // and get an agent + int ret = mlir_aie_init_device(xaie); + + if (ret) { + std::cout << "[ERROR] Error when calling mlir_aie_init_device)" + << std::endl; + return -1; + } + + // Getting access to all of the HSA agents + std::vector agents = xaie->agents; + + if (agents.empty()) { + std::cout << "No agents found. Exiting." 
<< std::endl; + return -1; + } + + std::cout << "Found " << agents.size() << " agents" << std::endl; + + hsa_queue_t *q = xaie->cmd_queue; + + // Adding to our vector of queues + queues.push_back(q); + assert(queues.size() > 0 && "No queues were sucesfully created!"); + + mlir_aie_configure_cores(xaie); + mlir_aie_configure_switchboxes(xaie); + mlir_aie_initialize_locks(xaie); + mlir_aie_configure_dmas(xaie); + mlir_aie_start_cores(xaie); + + // Allocating some device memory + ext_mem_model_t buf0, buf1, buf2; + uint32_t *in_a = (uint32_t *)mlir_aie_mem_alloc(xaie, buf0, DMA_COUNT); + uint32_t *in_b = (uint32_t *)mlir_aie_mem_alloc(xaie, buf1, DMA_COUNT); + uint32_t *out = (uint32_t *)mlir_aie_mem_alloc(xaie, buf2, DMA_COUNT); + mlir_aie_sync_mem_dev(buf0); + mlir_aie_sync_mem_dev(buf1); + mlir_aie_sync_mem_dev(buf2); + + if (in_a == nullptr || in_b == nullptr || out == nullptr) { + std::cout << "Could not allocate in device memory" << std::endl; + return -1; + } + + for (int i = 0; i < DMA_COUNT; i++) { + in_a[i] = i + 1; + in_b[i] = i; + out[i] = 0xdeface; + } + + // Pass arguments in the order of dma_memcpys in the mlir + invoke_data_movement(queues[0], &agents[0], out, in_a, in_b); + + int errors = 0; + + for (int i = 0; i < DMA_COUNT; i++) { + uint32_t s0 = in_a[i]; + uint32_t s1 = in_b[i]; + uint32_t d = out[i]; + printf("s0[%d] = 0x%x\n", i, s0); + printf("s1[%d] = 0x%x\n", i, s1); + printf("d[%d] = 0x%x\n", i, d); + if (d != (s0 * s1)) { + errors++; + printf("mismatch 0x%x != 0x%x + 0x%x\n", d, s0, s1); + } + } + + // destroying the queue + hsa_queue_destroy(queues[0]); + + // Shutdown AIR and HSA + mlir_aie_deinit_libxaie(xaie); + + if (!errors) { + printf("PASS!\n"); + return 0; + } else { + printf("fail %d/%d.\n", errors, DMA_COUNT); + return -1; + } +} diff --git a/test/npu-xrt/add_one_two/CMakeLists.txt b/test/npu-xrt/add_one_two/CMakeLists.txt new file mode 100644 index 0000000000..8aedc5111f --- /dev/null +++ b/test/npu-xrt/add_one_two/CMakeLists.txt 
@@ -0,0 +1,71 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Xilinx Inc. + +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif() + +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName ${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) + +target_include_directories (${currentTarget} PUBLIC + ../../utils + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + boost_filesystem + ) +else() + 
target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() \ No newline at end of file diff --git a/test/npu-xrt/add_one_two/Makefile b/test/npu-xrt/add_one_two/Makefile new file mode 100644 index 0000000000..1829431591 --- /dev/null +++ b/test/npu-xrt/add_one_two/Makefile @@ -0,0 +1,31 @@ +include ../makefile-common + +targetname = addOneObjfifo + +all: build/final.xclbin build/insts.txt + +build/one.xclbin: aie1.mlir + mkdir -p ${@D} + cd ${@D} && aiecc.py --xclbin-kernel-name=ADDONE --xclbin-kernel-id=0x901 --xclbin-instance-name=ADDONEINST --no-aiesim --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) + +build/two.xclbin: aie2.mlir build/one.xclbin + cd ${@D} && aiecc.py --xclbin-kernel-name=ADDTWO --xclbin-kernel-id=0x902 --xclbin-instance-name=ADDTWOINST --no-aiesim --aie-generate-cdo --no-compile-host --xclbin-input=one.xclbin --xclbin-name=${@F} \ + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) + +${targetname}.exe: test.cpp + rm -rf _build + mkdir -p _build + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake --build . --config Release +ifeq "${powershell}" "powershell.exe" + cp _build/${targetname}.exe $@ +else + cp _build/${targetname} $@ +endif + +run: ${targetname}.exe build/two.xclbin build/insts.txt + ${powershell} ./$< -x build/two.xclbin -i build/insts.txt + +clean: + rm -rf build _build ${targetname}.exe \ No newline at end of file