diff --git a/programming_examples/basic/matrix_multiplication/whole_array/matrixMultiplication.exe b/programming_examples/basic/matrix_multiplication/whole_array/matrixMultiplication.exe
new file mode 100755
index 0000000000..d39f703905
Binary files /dev/null and b/programming_examples/basic/matrix_multiplication/whole_array/matrixMultiplication.exe differ
diff --git a/programming_examples/basic/passthrough_dmas/test.cpp b/programming_examples/basic/passthrough_dmas/test.cpp
index 9c11596119..63248d0959 100644
--- a/programming_examples/basic/passthrough_dmas/test.cpp
+++ b/programming_examples/basic/passthrough_dmas/test.cpp
@@ -166,8 +166,22 @@ int main(int argc, const char *argv[]) {
   if (verbosity >= 1)
     std::cout << "Running Kernel." << std::endl;
   unsigned int opcode = 3;
-  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
-  run.wait();
+  // Benchmark: run the kernel 1000 times, logging each launch-to-completion time (us) to time.txt.
+  std::ofstream f_time("time.txt");
+  if (!f_time)
+    std::cerr << "Warning: could not open time.txt; timings will not be saved.\n";
+  for (int i = 1; i <= 1000; i++) {
+    const auto start = std::chrono::steady_clock::now(); // monotonic clock
+    auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+    run.wait();
+    const auto stop = std::chrono::steady_clock::now();
+    // Keep the integral microsecond count; a float would round large values.
+    const auto npu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
+    if (i < 11) // echo only the first ten runs on the console
+      std::cout << i << " " << srcVecA.size() << " NPU time: " << npu_time << "us.\n";
+    f_time << npu_time << "\n";
+  }
+  f_time.close();
 
   bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
 
diff --git a/programming_examples/basic/passthrough_dmas/time.txt b/programming_examples/basic/passthrough_dmas/time.txt
new file mode 100644
index 0000000000..22b154f9ae
--- /dev/null
+++ b/programming_examples/basic/passthrough_dmas/time.txt
@@ -0,0 +1,1000 @@
+2148
+331
+335
+253
+227
+154
+148
+148
+150
+150
+139
+128
+121
+127
+123
+123
+123
+123
+125
+131
+130
+131
+130
+131
+157
+136
+159
+163
+169
+164
+164
+162
+143
+133
+120
+120
+121
+120
+120
+117
+128
+127
+127
+126
+126
+112
+126
+113
+126
+126
+133
+126
+125
+130
+126
+124
+127
+127
+125
+108
+126
+128
+127
+112
+115
+109
+114
+114
+113
+115
+115
+115
+106
+121
+121
+121
+120
+119
+120
+119
+111
+122
+122
+121
+121
+122
+122
+122
+108
+188
+209
+152
+122
+121
+122
+113
+106
+109
+110
+109
+110
+114
+110
+111
+111
+122
+122
+122
+121
+119
+120
+120
+108
+121
+122
+120
+120
+120
+120
+120
+120
+118
+198
+179
+116
+121
+120
+108
+147
+199
+179
+106
+106
+106
+106
+105
+121
+122
+120
+119
+119
+120
+119
+119
+120
+120
+119
+119
+119
+119
+119
+111
+125
+121
+120
+118
+125
+121
+121
+119
+112
+120
+120
+119
+119
+119
+119
+119
+107
+120
+121
+121
+121
+120
+119
+119
+123
+110
+119
+120
+120
+119
+119
+119
+119
+105
+105
+105
+106
+108
+114
+105
+105
+105
+105
+121
+121
+121
+120
+118
+121
+120
+120
+124
+196
+191
+120
+120
+120
+108
+104
+105
+105
+105
+105
+105
+105
+104
+106
+107
+121
+122
+118
+102
+119
+120
+119
+117
+110
+120
+120
+120
+119
+120
+121
+119
+155
+199
+191
+119
+119
+119
+107
+121
+121
+121
+120
+119
+121
+120
+121
+113
+120
+119
+124
+102
+121
+120
+120
+106
+105
+106
+106
+108
+104
+104
+106
+105
+104
+120
+120
+120
+120
+118
+119
+119
+119
+112
+119
+119
+120
+119
+119
+120
+119
+107
+121
+121
+119
+114
+121
+120
+119
+123
+110
+121
+120
+119
+119
+118
+119
+103
+194
+203
+183
+114
+110
+110
+122
+120
+119
+119
+119
+119
+119
+119
+118
+111
+109
+113
+110
+103
+120
+119
+119
+119
+119
+121
+119
+119
+119
+120
+119
+119
+105
+121
+120
+121
+121
+119
+119
+119
+112
+117
+198
+190
+120
+120
+120
+108
+105
+105
+106
+104
+98
+110
+114
+108
+122
+120
+121
+121
+119
+121
+120
+121
+119
+112
+120
+120
+119
+119
+119
+120
+119
+108
+105
+105
+104
+104
+105
+105
+104
+105
+105
+121
+121
+120
+124
+119
+121
+103
+121
+117
+121
+120
+119
+120
+120
+119
+119
+108
+122
+121
+121
+120
+119
+124
+119
+120
+135
+198
+191
+120
+120
+121
+108
+105
+106
+104
+104
+99
+110
+110
+109
+119
+121
+121
+121
+119
+119
+121
+120
+120
+113
+110
+110
+119
+119
+119
+119
+119
+119
+119
+119
+119
+119
+119
+119
+119
+111
+121
+121
+120
+124
+123
+121
+122
+124
+111
+110
+110
+110
+106
+105
+110
+110
+110
+120
+119
+119
+119
+119
+119
+119
+119
+107
+121
+121
+120
+120
+121
+122
+121
+120
+129
+199
+173
+121
+119
+104
+108
+165
+200
+180
+107
+106
+106
+105
+124
+121
+121
+120
+119
+119
+119
+119
+113
+119
+123
+120
+120
+119
+119
+119
+106
+121
+121
+119
+125
+120
+121
+121
+119
+121
+120
+120
+120
+120
+118
+119
+119
+105
+106
+105
+106
+105
+105
+104
+104
+105
+105
+136
+198
+192
+121
+121
+121
+108
+129
+199
+196
+106
+105
+104
+105
+105
+105
+104
+103
+110
+106
+105
+105
+105
+105
+110
+197
+191
+121
+121
+121
+119
+119
+119
+122
+119
+119
+120
+120
+121
+122
+119
+119
+128
+120
+128
+121
+100
+97
+98
+97
+113
+114
+114
+116
+114
+109
+120
+124
+127
+121
+123
+121
+123
+121
+117
+114
+114
+113
+114
+114
+111
+115
+99
+105
+122
+122
+119
+107
+120
+121
+122
+112
+123
+122
+122
+122
+123
+126
+122
+117
+122
+123
+122
+121
+120
+121
+103
+118
+114
+114
+117
+115
+114
+115
+114
+115
+114
+125
+199
+172
+114
+114
+114
+118
+120
+122
+122
+121
+120
+120
+126
+121
+176
+198
+187
+122
+122
+122
+121
+121
+121
+120
+120
+120
+120
+123
+117
+143
+199
+183
+122
+121
+121
+117
+114
+113
+114
+113
+111
+110
+114
+111
+116
+199
+198
+130
+122
+121
+121
+110
+110
+110
+111
+110
+111
+110
+110
+109
+110
+122
+121
+125
+121
+121
+120
+120
+112
+123
+122
+123
+122
+121
+122
+121
+113
+122
+122
+122
+121
+120
+121
+121
+113
+157
+199
+193
+122
+122
+122
+113
+110
+114
+110
+109
+111
+110
+110
+109
+110
+123
+122
+122
+121
+120
+122
+122
+116
+122
+122
+122
+121
+120
+125
+120
+112
+156
+199
+193
+122
+122
+121
+112
+109
+110
+109
+112
+114
+112
+115
+114
+115
+199
+198
+127
+122
+123
+121
+109
+114
+110
+111
+110
+110
+110
+110
+110
+121
+121
+122
+121
+120
+122
+122
+122
+112
+123
+122
+119
+102
+121
+121
+121
+117
+118
+114
+114
+114
+109
+110
+115
+111
+111
+122
+121
+122
+121
+120
+120
+120
+112
+123
+122
+121
+121
+122
+122
+122
+112
+162
+199
+181
+122
+120
+103
+118
+115
+114
+113
+113
+109
+110
+110
+110
+110
+123
+122
+122
+121
+120
+120
+121
+112
+155
+199
+197
+122
+122
+121
+111
+110
+109
+110
+148
+156
+161
+161
+169
+171
+169
+168
+172
+152
+144
+127
+116
+117
+114
+115
+113
+115
+124
+123
+123
+122
+123
+124
+123
+112
+122
+121
+121
+110
+122
+121
+120
+112
+164
+200
+193
+123
+123
+122
+113
+110
+111
+115
+110
+109
+110
+110
+110
+110
+122
+122
+122
+121
+120
+122
+121
+117
+122
+122
+121
+123
+121
+121
+121
+112
+122
+122
+122
+121
+120
+122
+121
+120
+177
+197
+193
+122
+122
+122
+112
+110
+110
+110
+110
+109
+110
+110
+110
+110
+123
+121
+120
+108
+126
+120
+121
+116
+123
+123
+122
+121
+121
+123
+121
+117
+122
+122
+122
+121
+120
+121
+121
+120
+114
+114
+123
+122
+123
+122
+121
+123
+114
+115
+115
+113
diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile
index 11f2824a42..07e9a1d376 100644
--- a/programming_examples/basic/passthrough_kernel/Makefile
+++ b/programming_examples/basic/passthrough_kernel/Makefile
@@ -14,7 +14,9 @@ include ${srcdir}/../../makefile-common
 
 targetname = passThroughKernel
 VPATH := ${srcdir}/../../../aie_kernels/generic
-data_size = 4096
+data_size = 20480
+
+
 trace_size = 8192
 PASSTHROUGH_SIZE = ${data_size}
 
diff --git a/programming_examples/basic/passthrough_kernel/test.cpp b/programming_examples/basic/passthrough_kernel/test.cpp
index f28691abcd..127149b84a 100644
--- a/programming_examples/basic/passthrough_kernel/test.cpp
+++ b/programming_examples/basic/passthrough_kernel/test.cpp
@@ -88,8 +88,22 @@ int main(int argc, const char *argv[]) {
   if (verbosity >= 1)
     std::cout << "Running Kernel.\n";
   unsigned int opcode = 3;
-  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_out);
-  run.wait();
+  // Benchmark: run the kernel 1000 times, logging each launch-to-completion time (us) to time.txt.
+  std::ofstream f_time("time.txt");
+  if (!f_time)
+    std::cerr << "Warning: could not open time.txt; timings will not be saved.\n";
+  for (int i = 1; i <= 1000; i++) {
+    const auto start = std::chrono::steady_clock::now(); // monotonic clock
+    auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_out);
+    run.wait();
+    const auto stop = std::chrono::steady_clock::now();
+    // Keep the integral microsecond count; a float would round large values.
+    const auto npu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
+    if (i < 11) // echo only the first ten runs on the console
+      std::cout << i << " " << PASSTHROUGH_SIZE << " NPU time: " << npu_time << "us.\n";
+    f_time << npu_time << "\n";
+  }
+  f_time.close();
 
   // Sync device to host memories
   bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py
index 814f8c7a6a..5751a26a6c 100644
--- a/programming_examples/basic/passthrough_kernel/test.py
+++ b/programming_examples/basic/passthrough_kernel/test.py
@@ -81,8 +81,22 @@ def main(opts):
     if opts.verbosity >= 1:
         print("Running Kernel.")
     opcode = 3
-    h = kernel(opcode, bo_instr, len(instr_v), bo_inout0, bo_inout1)
-    h.wait()
+    # Benchmark: launch the kernel 1000 times and record the host-side latency of
+    # each launch, in microseconds, one value per line in time_py.txt.
+    with open("time_py.txt", "w") as f:
+        for i in range(1000):
+            # perf_counter_ns() is monotonic; time_ns() follows the adjustable wall clock.
+            start = time.perf_counter_ns()
+
+            h = kernel(opcode, bo_instr, len(instr_v), bo_inout0, bo_inout1)
+            h.wait()
+
+            stop = time.perf_counter_ns()
+            # Nanoseconds -> whole microseconds, rounded to nearest.
+            npu_time_us = round((stop - start) / 1000)
+            f.write(f"{npu_time_us}\n")
+            print(f"{i}: {opts.size} - {npu_time_us}")
+
     bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
 
     # Copy output results and verify they are correct
diff --git a/programming_examples/basic/passthrough_kernel/time.txt b/programming_examples/basic/passthrough_kernel/time.txt
new file mode 100644
index 0000000000..e2f3556454
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/time.txt
@@ -0,0 +1,1000 @@
+353
+242
+207
+176
+149
+147
+149
+146
+132
+132
+131
+135
+136
+118
+134
+135
+135
+135
+134
+135
+136
+135
+134
+134
+135
+135
+134
+134
+136
+135
+134
+133
+133
+131
+128
+128
+128
+128
+131
+128
+128
+131
+137
+124
+137
+132
+132
+132
+132
+132
+138
+132
+131
+136
+136
+134
+133
+135
+135
+135
+133
+133
+136
+134
+134
+134
+134
+133
+132
+134
+132
+127
+124
+122
+132
+126
+125
+125
+125
+124
+126
+126
+126
+124
+125
+126
+125
+125
+125
+125
+125
+124
+124
+125
+124
+125
+125
+125
+125
+126
+126
+131
+189
+204
+197
+126
+127
+127
+124
+125
+126
+126
+126
+126
+125
+124
+125
+125
+124
+125
+125
+125
+125
+125
+124
+124
+125
+124
+125
+125
+124
+125
+125
+124
+107
+125
+126
+126
+126
+126
+125
+126
+126
+125
+126
+125
+125
+125
+125
+124
+124
+124
+131
+124
+124
+124
+158
+200
+197
+126
+126
+126
+125
+124
+130
+127
+126
+129
+126
+124
+124
+126
+126
+126
+125
+125
+125
+125
+124
+124
+125
+128
+112
+125
+125
+125
+125
+125
+125
+124
+126
+126
+125
+124
+124
+124
+110
+127
+127
+126
+125
+129
+127
+129
+126
+125
+126
+126
+124
+126
+125
+126
+127
+125
+126
+125
+125
+125
+124
+124
+125
+126
+126
+125
+124
+127
+126
+131
+127
+125
+126
+125
+126
+126
+125
+127
+125
+125
+125
+125
+125
+189
+208
+197
+125
+126
+124
+125
+124
+124
+125
+125
+125
+125
+129
+125
+125
+117
+127
+125
+125
+125
+127
+190
+207
+197
+126
+126
+125
+125
+125
+125
+125
+125
+125
+126
+125
+125
+125
+125
+126
+126
+127
+125
+125
+124
+125
+108
+130
+126
+126
+126
+127
+125
+126
+126
+125
+126
+125
+125
+125
+167
+200
+197
+125
+129
+125
+125
+125
+124
+126
+126
+126
+124
+124
+107
+124
+127
+126
+126
+126
+125
+125
+203
+201
+198
+126
+126
+125
+125
+125
+124
+125
+107
+125
+134
+130
+126
+126
+125
+125
+126
+127
+126
+127
+126
+125
+109
+129
+125
+126
+126
+126
+125
+126
+125
+129
+125
+126
+125
+125
+166
+200
+198
+126
+126
+127
+125
+126
+125
+126
+126
+127
+125
+125
+126
+125
+108
+130
+126
+126
+125
+126
+126
+126
+126
+125
+126
+126
+125
+125
+177
+197
+198
+110
+112
+125
+126
+125
+124
+126
+127
+126
+126
+125
+125
+125
+132
+200
+183
+183
+182
+146
+142
+141
+137
+138
+139
+131
+132
+132
+132
+131
+130
+130
+131
+125
+125
+125
+126
+126
+126
+125
+125
+132
+131
+126
+132
+131
+132
+131
+126
+132
+131
+131
+130
+129
+131
+132
+126
+126
+126
+126
+125
+124
+125
+125
+132
+139
+132
+131
+130
+132
+132
+128
+131
+118
+136
+132
+132
+131
+131
+127
+126
+126
+126
+126
+126
+126
+126
+131
+132
+132
+131
+130
+130
+131
+127
+131
+131
+132
+131
+130
+131
+131
+132
+129
+135
+132
+132
+132
+132
+128
+131
+132
+132
+119
+141
+132
+136
+126
+125
+126
+126
+126
+108
+126
+126
+126
+126
+125
+125
+126
+126
+126
+125
+125
+125
+109
+125
+125
+125
+126
+125
+125
+125
+125
+125
+125
+126
+126
+126
+125
+108
+126
+126
+109
+108
+108
+108
+128
+130
+126
+126
+126
+126
+128
+125
+184
+198
+203
+126
+127
+126
+126
+125
+127
+127
+126
+126
+128
+126
+126
+126
+125
+125
+125
+125
+126
+126
+168
+200
+198
+127
+126
+127
+126
+125
+109
+127
+126
+125
+127
+125
+126
+130
+126
+127
+126
+125
+125
+126
+127
+126
+108
+126
+127
+125
+126
+125
+126
+130
+126
+125
+125
+125
+126
+126
+127
+124
+109
+126
+127
+126
+126
+126
+125
+126
+126
+125
+126
+126
+126
+126
+127
+126
+125
+125
+126
+125
+125
+125
+125
+126
+126
+125
+125
+125
+125
+125
+125
+127
+127
+127
+126
+126
+117
+130
+139
+133
+133
+131
+138
+131
+125
+130
+129
+129
+130
+127
+127
+127
+124
+123
+123
+124
+124
+124
+123
+123
+125
+133
+136
+133
+132
+131
+134
+128
+131
+130
+130
+130
+128
+129
+129
+126
+123
+124
+124
+124
+123
+124
+124
+126
+133
+137
+132
+136
+133
+133
+128
+130
+128
+131
+130
+129
+128
+128
+127
+132
+133
+133
+133
+131
+133
+131
+123
+126
+129
+124
+124
+124
+123
+125
+130
+132
+128
+128
+130
+130
+130
+126
+132
+134
+134
+134
+133
+132
+129
+126
+124
+124
+123
+124
+123
+123
+124
+131
+130
+131
+130
+129
+129
+129
+129
+123
+123
+124
+124
+124
+123
+123
+125
+133
+129
+121
+121
+109
+126
+125
+125
+123
+120
+125
+121
+108
+175
+178
+188
+145
+140
+140
+140
+135
+130
+111
+109
+128
+129
+129
+130
+129
+121
+120
+119
+121
+121
+121
+120
+120
+120
+130
+131
+130
+130
+135
+131
+131
+128
+129
+130
+134
+129
+129
+128
+127
+120
+120
+120
+120
+120
+121
+120
+120
+121
+131
+134
+131
+129
+129
+131
+128
+129
+130
+129
+129
+129
+128
+129
+121
+131
+131
+130
+130
+129
+132
+129
+122
+121
+121
+126
+126
+126
+124
+125
+126
+130
+110
+129
+128
+128
+128
+128
+127
+127
+129
+128
+128
+128
+128
+121
+132
+131
+131
+131
+131
+130
+129
+126
+123
+123
+126
+125
+125
+125
+124
+124
+272
+222
+130
+129
+129
+121
+120
+121
+121
+121
+120
+120
+121
+121
+131
+131
+131
+130
+129
+129
+133
+130
+128
+128
+128
+128
+128
+128
+121
+131
+138
+131
+130
+129
+131
+112
+130
+126
+126
+121
+121
+121
+121
+122
+125
+125
+125
+125
+125
+125
+124
+125
+125
+125
+125
+127
+127
+126
+126
+125
+125
+125
+112
+128
+126
+125
+124
+128
+126
+126
+125
+129
+126
+126
+126
+125
+186
+201
+196
+126
+126
+126
+124
+125
+125
+125
+124
+124
+125
+126
+126
+126
+117
+126
+126
+126
+126
+126
+126
+125
+124
+127
+126
diff --git a/programming_examples/basic/passthrough_kernel/time_py.txt b/programming_examples/basic/passthrough_kernel/time_py.txt
new file mode 100644
index 0000000000..3c0646f78b
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/time_py.txt
@@ -0,0 +1,1000 @@
+416
+285
+279
+203
+193
+173
+142
+139
+139
+137
+137
+137
+137
+142
+197
+199
+142
+145
+151
+152
+150
+151
+151
+150
+150
+125
+142
+140
+140
+140
+151
+140
+138
+136
+135
+136
+140
+141
+135
+135
+139
+139
+139
+138
+137
+137
+134
+130
+134
+133
+140
+131
+130
+133
+123
+134
+140
+140
+138
+138
+114
+144
+132
+133
+125
+122
+111
+179
+193
+155
+154
+120
+131
+127
+126
+123
+126
+123
+123
+124
+126
+126
+126
+127
+141
+134
+132
+132
+133
+134
+130
+128
+128
+128
+127
+128
+127
+128
+128
+134
+138
+147
+135
+132
+132
+127
+134
+134
+135
+135
+134
+133
+126
+117
+128
+130
+128
+128
+128
+127
+127
+127
+127
+127
+127
+128
+129
+127
+127
+133
+133
+134
+134
+132
+132
+129
+128
+133
+139
+128
+128
+127
+127
+122
+135
+134
+134
+136
+130
+133
+125
+134
+141
+135
+134
+132
+134
+129
+128
+127
+130
+127
+127
+127
+124
+128
+132
+133
+132
+132
+132
+132
+139
+146
+139
+125
+134
+135
+133
+132
+128
+127
+127
+127
+117
+124
+123
+129
+132
+130
+131
+132
+131
+131
+131
+124
+134
+134
+134
+132
+132
+134
+112
+130
+122
+123
+123
+123
+123
+121
+123
+134
+140
+143
+132
+132
+133
+130
+132
+131
+131
+132
+122
+142
+128
+131
+134
+133
+133
+134
+133
+132
+135
+142
+132
+131
+131
+131
+131
+131
+123
+134
+135
+134
+134
+133
+134
+128
+129
+127
+134
+127
+124
+127
+129
+132
+132
+132
+132
+132
+131
+131
+111
+140
+133
+132
+132
+132
+132
+129
+126
+127
+127
+127
+125
+127
+126
+128
+127
+127
+132
+131
+131
+131
+132
+131
+131
+141
+129
+129
+132
+131
+111
+140
+134
+133
+131
+121
+145
+133
+127
+127
+127
+128
+124
+127
+127
+128
+132
+131
+131
+131
+131
+131
+131
+123
+133
+133
+134
+133
+132
+132
+111
+125
+123
+140
+129
+136
+137
+189
+273
+257
+263
+208
+199
+195
+193
+193
+195
+195
+168
+142
+134
+140
+134
+137
+135
+136
+135
+133
+137
+142
+131
+130
+129
+128
+133
+128
+128
+134
+133
+115
+137
+132
+132
+138
+132
+132
+131
+132
+132
+132
+133
+126
+128
+129
+137
+148
+148
+147
+141
+127
+127
+127
+127
+127
+120
+123
+132
+131
+131
+131
+131
+131
+131
+123
+122
+110
+138
+129
+129
+126
+125
+111
+130
+129
+124
+129
+124
+125
+124
+128
+127
+134
+133
+144
+133
+132
+131
+131
+131
+132
+132
+131
+132
+130
+136
+136
+135
+135
+133
+133
+133
+127
+145
+145
+143
+145
+145
+144
+122
+122
+126
+123
+122
+123
+123
+123
+134
+135
+135
+135
+133
+133
+134
+127
+133
+147
+136
+131
+131
+131
+112
+154
+142
+141
+142
+148
+141
+138
+128
+127
+127
+116
+127
+128
+127
+132
+134
+131
+131
+131
+131
+131
+124
+123
+123
+123
+123
+122
+123
+122
+112
+123
+124
+135
+136
+136
+134
+130
+130
+140
+128
+126
+127
+126
+127
+127
+131
+136
+132
+115
+132
+132
+130
+123
+122
+123
+122
+122
+123
+123
+125
+142
+135
+135
+133
+133
+135
+135
+129
+132
+132
+131
+130
+131
+131
+124
+122
+124
+140
+129
+126
+127
+128
+124
+129
+142
+123
+122
+123
+122
+122
+233
+147
+183
+149
+148
+129
+149
+167
+148
+144
+147
+145
+141
+128
+128
+127
+128
+128
+128
+132
+135
+141
+139
+135
+134
+135
+132
+143
+129
+118
+130
+128
+127
+127
+127
+127
+127
+127
+127
+127
+127
+127
+128
+136
+136
+136
+135
+134
+134
+130
+135
+133
+134
+136
+132
+132
+134
+146
+138
+132
+131
+132
+132
+131
+126
+134
+171
+268
+256
+184
+185
+184
+184
+193
+195
+165
+188
+194
+168
+127
+140
+132
+132
+132
+131
+132
+124
+123
+123
+132
+123
+122
+123
+122
+122
+134
+133
+134
+137
+131
+132
+130
+127
+128
+132
+132
+131
+131
+130
+116
+132
+132
+130
+131
+132
+131
+131
+145
+134
+135
+133
+132
+133
+132
+129
+132
+131
+138
+131
+131
+131
+124
+123
+123
+123
+123
+123
+123
+122
+109
+135
+134
+133
+133
+134
+134
+132
+128
+132
+132
+132
+131
+132
+131
+124
+123
+122
+123
+123
+124
+127
+123
+123
+134
+139
+134
+132
+133
+142
+129
+134
+132
+132
+137
+129
+128
+131
+123
+122
+123
+125
+122
+123
+122
+123
+123
+134
+134
+134
+132
+131
+131
+128
+127
+132
+132
+132
+131
+131
+138
+123
+123
+141
+128
+127
+126
+127
+130
+132
+186
+264
+267
+195
+194
+193
+193
+194
+188
+184
+185
+190
+159
+135
+128
+127
+123
+123
+126
+123
+123
+126
+124
+134
+124
+124
+125
+123
+123
+123
+126
+124
+119
+129
+123
+139
+133
+122
+122
+122
+122
+123
+122
+123
+115
+123
+123
+122
+123
+122
+122
+123
+122
+126
+122
+122
+122
+122
+123
+122
+122
+122
+129
+122
+122
+122
+122
+122
+122
+123
+109
+124
+128
+127
+124
+133
+129
+123
+127
+123
+124
+126
+123
+123
+123
+123
+126
+123
+123
+123
+123
+123
+123
+126
+123
+123
+126
+123
+126
+127
+125
+128
+133
+124
+127
+122
+122
+133
+124
+124
+123
+114
+124
+143
+130
+123
+123
+122
+122
+122
+122
+122
+123
+122
+109
+182
+198
+195
+145
+127
+128
+126
+127
+123
+124
+123
+260
+195
+155
+150
+153
+144
+133
+132
+131
+130
+135
+129
+129
+123
+127
+128
+126
+127
+124
+132
+124
+127
+132
+132
+132
+132
+132
+144
+114
+147
+146
+145
+146
+145
+144
+134
+127
+138
+135
+123
+122
+122
+130
+132
+131
+131
+115
+132
+129
+128
+131
+132
+134
+133
+133
+133
+133
+131
+136
+131
+131
+131
+132
+132
+130
+122
+134
+134
+134
+131
+138
+133
+128
+126
+128
+126
+127
+126
+124
+127
+127
+133
+132
+112
+118
+131
+131
+131
+137
+134
+136
+133
+134
+133
+133
+129
+128
+127
+128
+128
+128
+127
diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile
index e969be08f3..4b43a14735 100644
--- a/programming_examples/basic/vector_scalar_mul/Makefile
+++ b/programming_examples/basic/vector_scalar_mul/Makefile
@@ -18,57 +18,57 @@ targetname = vectorScalar
 data_size = 4096
 trace_size = 8192
 
-all: build/final_${data_size}.xclbin build/insts_${data_size}.txt
+all: build/final.xclbin build/insts.txt
 
-kristof: build/insts_${data_size}.txt
+kristof: build/insts.txt
 
 build/%.o: %.cc
 	mkdir -p ${@D}
 	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
 
-build/aie_${data_size}.mlir: ${srcdir}/aie2.py
+build/aie.mlir: ${srcdir}/aie2.py
 	mkdir -p ${@D}
 	python3 $< ${data_size} 0 > $@
 
-build/aie_trace_${data_size}.mlir: aie2.py
+build/aie_trace.mlir: aie2.py
 	mkdir -p ${@D}
 	python3 $< ${data_size} ${trace_size} > $@
 
-#build/insts_${data_size}.txt: build/final_${data_size}.xclbin
-build/final_${data_size}.xclbin: build/aie_${data_size}.mlir build/scale.o
+#build/insts.txt: build/final.xclbin
+build/final.xclbin: build/aie.mlir build/scale.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
-build/final_trace_${data_size}.xclbin: build/aie_trace_${data_size}.mlir build/scale.o
+build/final_trace.xclbin: build/aie_trace.mlir build/scale.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
-${targetname}_${data_size}.exe: ${srcdir}/test.cpp
+${targetname}.exe: ${srcdir}/test.cpp
 	rm -rf _build
 	mkdir -p _build
-	cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname}_${data_size} -DVECTORSCALARMUL_SIZE=${data_size}
+	cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname} -DVECTORSCALARMUL_SIZE=${data_size}
 	cd _build && ${powershell} cmake --build . --config Release
 ifeq "${powershell}" "powershell.exe"
-	cp _build/${targetname}_${data_size}.exe $@
+	cp _build/${targetname}.exe $@
 else
-	cp _build/${targetname}_${data_size} $@ 
+	cp _build/${targetname} $@ 
 endif
 
-run: ${targetname}_${data_size}.exe build/final_${data_size}.xclbin build/insts_${data_size}.txt 
-	${powershell} ./$< -x build/final_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE
+run: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
 
-run_py: build/final_${data_size}.xclbin build/insts_${data_size}.txt
-	${powershell} python3 ${srcdir}/test.py -x build/final_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -s ${data_size}
+run_py: build/final.xclbin build/insts.txt
+	${powershell} python3 ${srcdir}/test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -s ${data_size}
 
-trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt 
-	${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size}
-	../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json
+trace: ${targetname}.exe build/final_trace.xclbin build/insts.txt 
+	${powershell} ./$< -x build/final_trace.xclbin -i build/insts.txt -k MLIR_AIE -t ${trace_size}
+	../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 > trace_vs.json
 
-trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
-	${powershell} python3 ${srcdir}/test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -s ${data_size}
-	../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json
+trace_py: build/final_trace.xclbin build/insts.txt
+	${powershell} python3 ${srcdir}/test.py -x build/final_trace.xclbin -i build/insts.txt -k MLIR_AIE -t ${trace_size} -s ${data_size}
+	../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 > trace_vs.json
 
 
 clean_trace:
diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py
index dd02a1010c..8d5327b3e7 100644
--- a/programming_examples/basic/vector_scalar_mul/aie2.py
+++ b/programming_examples/basic/vector_scalar_mul/aie2.py
@@ -18,7 +18,7 @@
 
 def my_vector_scalar(vector_size, trace_size):
     N = vector_size
-    N_in_bytes = N * 2
+    N_in_bytes = N * 4
     N_div_n = 4  # chop input vector into 4 sub-vectors
     n = N // N_div_n
 
@@ -28,17 +28,17 @@ def my_vector_scalar(vector_size, trace_size):
 
     @device(AIEDevice.npu1_1col)
     def device_body():
-        memRef_ty = T.memref(n, T.i16())
+        memRef_ty = T.memref(n, T.i32())
         memRef_ty2 = T.memref(1, T.i32())
 
         # AIE Core Function declarations
 
         scale_scalar = external_func(
-            "vector_scalar_mul_int16_scalar",
+            "vector_scalar_mul_int32_scalar",
             inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()],
         )
         scale = external_func(
-            "vector_scalar_mul_int16_vector",
+            "vector_scalar_mul_int32_vector",
             inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()],
         )
 
@@ -80,7 +80,7 @@ def core_body():
                 yield_([])
 
         # To/from AIE-array data movement
-        tensor_ty = T.memref(N, T.i16())
+        tensor_ty = T.memref(N, T.i32())
         scalar_ty = T.memref(1, T.i32())
 
         @runtime_sequence(tensor_ty, scalar_ty, tensor_ty)
diff --git a/programming_examples/basic/vector_scalar_mul/test.cpp b/programming_examples/basic/vector_scalar_mul/test.cpp
index d4acb04292..0a454cc628 100644
--- a/programming_examples/basic/vector_scalar_mul/test.cpp
+++ b/programming_examples/basic/vector_scalar_mul/test.cpp
@@ -22,8 +22,8 @@
 // Configure this to match your buffer data type
 // ------------------------------------------------------
 // using DATATYPE = std::uint8_t;
-// using DATATYPE = std::uint32_t;
-using DATATYPE = std::uint16_t;
+using DATATYPE = std::uint32_t;
+// using DATATYPE = std::uint16_t;
 #endif
 
 const int scaleFactor = 3;
@@ -100,12 +100,16 @@ int main(int argc, const char *argv[]) {
   bo_outC.sync(XCL_BO_SYNC_BO_TO_DEVICE);
 
   // Execute the kernel and wait to finish
+  auto start = std::chrono::high_resolution_clock::now();
   if (verbosity >= 1)
     std::cout << "Running Kernel.\n";
   unsigned int opcode = 3;
   auto run =
       kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC);
   run.wait();
+  auto stop = std::chrono::high_resolution_clock::now();
+  float npu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
+  std::cout << "NPU time: " << npu_time << "us." << std::endl;
 
   // Sync device to host memories
   bo_outC.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
@@ -128,10 +132,10 @@ int main(int argc, const char *argv[]) {
     }
   }
 
-  if (trace_size > 0) {
-    test_utils::write_out_trace(((char *)bufOut) + IN_SIZE, trace_size,
-                                vm["trace_file"].as<std::string>());
-  }
+  // if (trace_size > 0) {
+  //   test_utils::write_out_trace(((char *)bufOut) + IN_SIZE, trace_size,
+  //                               vm["trace_file"].as<std::string>());
+  // }
 
   // Print Pass/Fail result of our test
   if (!errors) {
diff --git a/programming_examples/basic/vector_vector_mul/aie2.py b/programming_examples/basic/vector_vector_mul/aie2.py
index 414d62fa26..b32341c0ba 100644
--- a/programming_examples/basic/vector_vector_mul/aie2.py
+++ b/programming_examples/basic/vector_vector_mul/aie2.py
@@ -18,7 +18,7 @@
 
 
 def my_vector_mul():
-    N = 256
+    N = 256*60
     n = 16
     N_div_n = N // n
 
diff --git a/programming_examples/basic/vector_vector_mul/test.cpp b/programming_examples/basic/vector_vector_mul/test.cpp
index 52af9beb06..ab43b12bdf 100644
--- a/programming_examples/basic/vector_vector_mul/test.cpp
+++ b/programming_examples/basic/vector_vector_mul/test.cpp
@@ -40,8 +40,8 @@ int main(int argc, const char *argv[]) {
   int n_warmup_iterations = vm["warmup"].as<int>();
   int trace_size = vm["trace_sz"].as<int>();
 
-  constexpr int IN_SIZE = 256;
-  constexpr int OUT_SIZE = 256;
+  constexpr int IN_SIZE = 256*60;
+  constexpr int OUT_SIZE = IN_SIZE;
 
   // Load instruction sequence
   std::vector<uint32_t> instr_v =
@@ -132,8 +132,20 @@ int main(int argc, const char *argv[]) {
   if (verbosity >= 1)
     std::cout << "Running Kernel.\n";
   unsigned int opcode = 3;
-  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
-  run.wait();
+  // Benchmark: run the kernel 1000 times, logging each launch-to-completion time (us) to time.txt.
+  std::ofstream f_time("time.txt");
+  if (!f_time) std::cerr << "Warning: could not open time.txt; timings will not be saved.\n";
+  for (int i = 1; i <= 1000; i++) {
+    const auto start = std::chrono::steady_clock::now(); // monotonic clock
+    auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+    run.wait();
+    const auto stop = std::chrono::steady_clock::now();
+    const auto npu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
+    if (i < 11) // echo only the first ten runs on the console
+      std::cout << i << " " << IN_SIZE << " NPU time: " << npu_time << "us.\n";
+    f_time << npu_time << "\n"; // integral microseconds, one value per line
+  }
+  f_time.close();
 
   bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
 
diff --git a/programming_examples/basic/vvmul-merged-ctrl/aie1.mlir b/programming_examples/basic/vvmul-merged-ctrl/aie1.mlir
new file mode 100644
index 0000000000..fe8c67d906
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-ctrl/aie1.mlir
@@ -0,0 +1,17 @@
+//===- aie1.mlir -----------------------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+//
+//===----------------------------------------------------------------------===//
+// Declares three tiles inside an npu1_1col device; this file defines nothing else.
+module {
+  aie.device(npu1_1col) {
+    %tile_0_0 = aie.tile(0, 0) // NOTE(review): arguments are presumably (column, row) - confirm
+    %tile_0_1 = aie.tile(0, 1)
+    %tile_0_2 = aie.tile(0, 2)
+  }
+}
diff --git a/programming_examples/basic/vvmul-merged-ctrl/aie2.mlir b/programming_examples/basic/vvmul-merged-ctrl/aie2.mlir
new file mode 100644
index 0000000000..2b982871c0
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-ctrl/aie2.mlir
@@ -0,0 +1,47 @@
+module { // Element-wise i32 multiply: the core on %tile_0_2 reads @in1 and @in2 and writes @out, 16 elements per object.
+  aie.device(npu1_1col) {
+    %tile_0_0 = aie.tile(0, 0) // one endpoint of every objectfifo below
+    %tile_0_2 = aie.tile(0, 2) // tile that hosts the core
+    aie.objectfifo @in1(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<16xi32>> // first input; consumed by the core on %tile_0_2. The `2 : i32` is presumably the fifo depth -- confirm
+    aie.objectfifo @in2(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<16xi32>> // second input; same endpoints and element type as @in1
+    aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<16xi32>> // result; produced by the core on %tile_0_2
+    %core_0_2 = aie.core(%tile_0_2) {
+      %c0 = arith.constant 0 : index
+      %c9223372036854775807 = arith.constant 9223372036854775807 : index // 2^63 - 1: upper bound of the outer loop, i.e. effectively "run forever"
+      %c1 = arith.constant 1 : index
+      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { // outer loop: repeats the 256-object batch below
+        %c0_0 = arith.constant 0 : index
+        %c256 = arith.constant 256 : index
+        %c1_1 = arith.constant 1 : index
+        scf.for %arg1 = %c0_0 to %c256 step %c1_1 { // 256 objects x 16 elements = 4096, the buffer size used by the runtime sequence
+          %0 = aie.objectfifo.acquire @in1(Consume, 1) : !aie.objectfifosubview<memref<16xi32>> // acquire one object of the first input
+          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
+          %2 = aie.objectfifo.acquire @in2(Consume, 1) : !aie.objectfifosubview<memref<16xi32>> // acquire one object of the second input
+          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
+          %4 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<16xi32>> // acquire one output object to fill
+          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
+          %c0_2 = arith.constant 0 : index
+          %c16 = arith.constant 16 : index
+          %c1_3 = arith.constant 1 : index
+          scf.for %arg2 = %c0_2 to %c16 step %c1_3 { // scalar loop over the 16 elements of the current objects
+            %6 = memref.load %1[%arg2] : memref<16xi32>
+            %7 = memref.load %3[%arg2] : memref<16xi32>
+            %8 = arith.muli %6, %7 : i32 // out[i] = in1[i] * in2[i]
+            memref.store %8, %5[%arg2] : memref<16xi32>
+          }
+          aie.objectfifo.release @in1(Consume, 1) // release all three objects acquired in this iteration
+          aie.objectfifo.release @in2(Consume, 1)
+          aie.objectfifo.release @out(Produce, 1)
+        }
+      }
+      aie.end
+    }
+    aiex.runtime_sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) { // three 4096 x i32 buffers: %arg0 -> @in1, %arg1 -> @in2, %arg2 -> @out
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<4096xi32> // bracket groups are presumably [offsets][sizes][strides] -- confirm
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1]) {id = 1 : i64, metadata = @in1} : memref<4096xi32> // same 4096-element access pattern, bound to @in1
+      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1]) {id = 2 : i64, metadata = @in2} : memref<4096xi32> // same 4096-element access pattern, bound to @in2
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} // NOTE(review): presumably waits for the @out transfer to finish -- confirm against the AIEX docs
+    }
+  }
+}
+
diff --git a/programming_examples/basic/vvmul-merged-ctrl/aie3.mlir b/programming_examples/basic/vvmul-merged-ctrl/aie3.mlir
new file mode 100644
index 0000000000..575c684839
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-ctrl/aie3.mlir
@@ -0,0 +1,436 @@
+module {
+  aie.device(npu1_1col) {
+    %tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>} // Tile (0, 1); its controller_id is packet type 0, packet id 26.
+    %tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>} // Tile (0, 0); its controller_id is packet type 0, packet id 15.
+    %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 27>} // Tile (0, 2); its controller_id is packet type 0, packet id 27, the same pkt_id the dma_memcpy_nd ops in the runtime sequence below put in their packet attribute.
+    memref.global "private" constant @blockwrite_data : memref<9xi32> = dense<[1440, 0, 0, 0, 0, 0, 0, 0, 8]>
+    memref.global "private" constant @blockwrite_data_0 : memref<444xi32> = dense<"0x43100038C30100005500000800005500000C0000990731169501404000C00100010001000100010055F87FFCFFFF5976301F2F780000B20462000000000000004D768CDCF0DF01000100010001000100010019140010010001000100010001009908B1149501402000C00100010001000100010059763E1803204B24000000001501007000000100010001000100010015010028030099063018010001000100C00300880300000000000000000000000100010001000100010001001908001001000100010001000100010019000000950000680000010001000100010001003B288B6508900140030003204B0500D08EFF3B290B25E8473F403EFF0320CB4500404EFF3B110002C30100D07FFF3B11A0A2C201008098FF3B1160C2C20100C0A8FF3B11B81803000000B8FF1960FE0F19C2FE0F1966FF0F19E4FF0F1962FB0FC00300880300000080C2FA070000000077108022C2010070F90FE40F5500090907007F00000002447338000000000000150100B0020043C8FF2728060000010001000100370100000000000000000000150100B0020043C8FF2768060000010001000100370100000000000000000000150100B0020043C8FF27880600000100010001003701000000000000000000004D9638D928FF4D16390A762059BA030259B4030159BE030259B8030159BC030259B2030159B603029D8F7B4F744059B00301FBAFF50700BA03C8774059BA0301FBCFB50700B40308772059BC0302FB9FF30600B40388762059B60302FB8F630400B20308762059B20302FBDFFB0700A203C8772059A20302FBCFBB0700BA0388774059B80301FBAFF50600BA0388744059BA0301FB8F710600B40388762059B60302FB1FFF0700B00308762059B20302FBCFBD0700BE0308772059BE0302FB2F650700BC0348773C59BCE302BDAFF58674000100998F71160100150100C00200BDCFF1877600BDEF7107760003203B2102007600F7203B4202B0E30000000000C0030048630028108046F90700000000150100C0020043C8032048060000010001000100370100000000000000000000150100C0020043C80320A8060000010001000100370100000000000000000000150100B0020043C8FF2728060000010001000100370100000000000000000000150100B0020043C8FF2768060000010001000100370100000000000000000000150100B0020043C8FF278806000001000100010037010000000000000000000059163B184D963A09760059B8030159B2030159BC030059B6030159BE030059B4030159BA03009D8F39CF740059B00301FBEF730600B8038F770059B803
01FBBFF30700B203CF762059BE0300FBAF730700B2034F770059B40301FB8FE70400B2030F762059B20300FBCFB90700A6038F772059A60300FBBFF70700B8030F772059BE0300FBAF750700B6034F750059BA0301FB8F710600B4038F762059B60300FBEFFD0400B0030F762059B20300FBCFFD0700BC030F772059BE0300FB5F670700BC034F773C59BCE30099AFF51601000100150100C00200BD8F71C674E0BDCFF18776E0BDEF710776E003203BC0020076E07F0076FC652754C1000000482000150100C0020043C8032048060000010001000100370100000000000000000000150100C0020043C80320A8060000010001000100370100000000000000000000198C2114010001000100010001001D0132884EFFBB90FF67FFFF1F083FFF010001000100010001001907B416BDC273864EFFBD98F3463EFF9948B51699A57316950140A800C85500080C070055808A0A07005580090B070001000100D9C2FA075962FB0759E4FF075966FF0759E8FE075960FE0759EAFD07596CFD0759EEFC07D976FC07D9FEFB0719180010010001000100010037010000000000C0FF07000019121210191800100100010001000100C0030088030000000000000000000000010001000100010001000100191210101918001001000100010001001900000059960B185500000C000059763E185500800B000043280B9E3F8C2F009501401003C01920033819ECFF0F9942FE0F19C0FE0F2F780000380040008046FF070000D98603070100010001000100010001001914001059768E1D01000100010001009968F115950140F002C001000100010001002F78000038004000000000000000D942FE0759EEFE0759ECFF07D97EFF070100010001001918001001000100010001007F0000007100000000E0FF070000BB10D01AC00100480001BB10C042C00100C850003B299B24AA173E4050005936061C59F6841C55206D0C00000100010079F6C1189920C41043288B98F7300700D98E0307D986FB07010001000100010001001954001001000100010019000000C00300280B8002000000000000000000198CE71459160A1801000100010001005916791A1918001043280B8C01212300010001000100BB8E0300000000000000">
+    memref.global "private" constant @blockwrite_data_1 : memref<6xi32> = dense<[5242896, 0, 0, 0, 0, 235159520]> // @blockwrite_data_1..6 are private constant 6 x i32 tables; in each only the first and last words are non-zero.
+    memref.global "private" constant @blockwrite_data_2 : memref<6xi32> = dense<[5505040, 0, 0, 0, 0, 100941792]> // NOTE(review): presumably payloads for block-write ops (per the symbol names); the consuming ops are not in this part of the file - confirm.
+    memref.global "private" constant @blockwrite_data_3 : memref<6xi32> = dense<[4718608, 0, 0, 0, 0, 503611362]>
+    memref.global "private" constant @blockwrite_data_4 : memref<6xi32> = dense<[4980752, 0, 0, 0, 0, 369393634]>
+    memref.global "private" constant @blockwrite_data_5 : memref<6xi32> = dense<[4194320, 0, 0, 0, 0, 772055013]>
+    memref.global "private" constant @blockwrite_data_6 : memref<6xi32> = dense<[4456464, 0, 0, 0, 0, 637837285]>
+    aiex.runtime_sequence(%arg0: memref<1024xi32>) {
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 2][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 4][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 6][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 8][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 10][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 15][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 20][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 22][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 27][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 32][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 37][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 42][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 47][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 52][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 57][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 62][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 67][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 72][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 77][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 82][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 87][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 92][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 97][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 102][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 107][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 112][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 117][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 122][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 127][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 132][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 137][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 142][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 147][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 152][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 157][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 162][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 167][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 172][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 177][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 182][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 187][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 192][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 197][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 202][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 207][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 212][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 217][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 222][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 227][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 232][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 237][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 242][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 247][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 252][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 257][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 262][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 267][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 272][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 277][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 282][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 287][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 292][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 297][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 302][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 307][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 312][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 317][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 322][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 327][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 332][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 337][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 342][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 347][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 352][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 357][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 362][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 367][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 372][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 377][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 382][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 387][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 392][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 397][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 402][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 407][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 412][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 417][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 422][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 427][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 432][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 437][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 442][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 447][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 452][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 457][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 462][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 467][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 472][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 477][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 482][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 487][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 492][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 497][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 502][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 507][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 512][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 517][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 522][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 527][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 532][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 537][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 542][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 547][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 552][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 557][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 562][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 567][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 572][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 577][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 579][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 581][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 583][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 585][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 587][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 589][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 591][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 593][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 595][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 597][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 599][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 601][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 603][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 605][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 607][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 609][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 611][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 613][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 615][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 617][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 619][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 621][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 623][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 625][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 627][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 629][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 631][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 633][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 635][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 637][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 639][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 641][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 643][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 645][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 650][1, 1, 1, 3][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 653][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 658][1, 1, 1, 3][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 661][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 666][1, 1, 1, 3][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 669][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 674][1, 1, 1, 3][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 677][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 682][1, 1, 1, 3][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 685][1, 1, 1, 5][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 690][1, 1, 1, 3][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 693][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 695][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 697][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 699][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 701][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 703][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 705][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 707][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 709][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 711][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 713][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 715][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 717][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 719][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 721][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 723][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 725][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 727][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 26>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 729][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 26>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 731][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 26>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 733][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 26>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 735][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 26>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 737][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 26>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 739][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 741][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 743][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 745][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 747][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 749][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 751][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 753][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 755][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 757][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 759][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 15>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 761][1, 1, 1, 2][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 27>) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref<1024xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+    }
+    aie.packet_flow(15) {
+      aie.packet_source<%tile_0_0, Ctrl : 0>
+      aie.packet_dest<%tile_0_0, South : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    aie.packet_flow(26) {
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_1, Ctrl : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    aie.shim_dma_allocation @ctrlpkt_col0_mm2s_chan0(MM2S, 0, 0)
+    memref.global "public" @ctrlpkt_col0_mm2s_chan0 : memref<2048xi32>
+    aie.packet_flow(15) {
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_0, Ctrl : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    aie.packet_flow(27) {
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_2, Ctrl : 0>
+    } {keep_pkt_header = true, priority_route = true}
+  }
+}
+
diff --git a/programming_examples/basic/vvmul-merged-ctrl/input_with_addresses-test.mlir b/programming_examples/basic/vvmul-merged-ctrl/input_with_addresses-test.mlir
new file mode 100644
index 0000000000..0efe95b979
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-ctrl/input_with_addresses-test.mlir
@@ -0,0 +1,173 @@
+module {
+  aie.device(npu1_1col) {
+    // Public symbols for the in1, in2 and out streams (each a 16 x i32 memref).
+    memref.global "public" @out_cons : memref<16xi32>
+    memref.global "public" @out : memref<16xi32>
+    memref.global "public" @in2_cons : memref<16xi32>
+    memref.global "public" @in2 : memref<16xi32>
+    memref.global "public" @in1_cons : memref<16xi32>
+    memref.global "public" @in1 : memref<16xi32>
+    // Tiles used by this device. The controller_id packet IDs (15 and 27) are the
+    // same IDs used by the packet flows that target each tile's Ctrl port.
+    %tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
+    %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 27>}
+    // Output stream: one lock pair on tile (0,0), two buffers plus a lock pair on
+    // tile (0,2). Buffers sit at fixed addresses, 64 bytes (16 x i32) apart.
+    %out_cons_prod_lock = aie.lock(%tile_0_0, 4) {init = 1 : i32, sym_name = "out_cons_prod_lock"}
+    %out_cons_cons_lock = aie.lock(%tile_0_0, 5) {init = 0 : i32, sym_name = "out_cons_cons_lock"}
+    %out_buff_0 = aie.buffer(%tile_0_2) {address = 1024 : i32, sym_name = "out_buff_0"} : memref<16xi32> 
+    %out_buff_1 = aie.buffer(%tile_0_2) {address = 1088 : i32, sym_name = "out_buff_1"} : memref<16xi32> 
+    %out_prod_lock = aie.lock(%tile_0_2, 4) {init = 2 : i32, sym_name = "out_prod_lock"}
+    %out_cons_lock = aie.lock(%tile_0_2, 5) {init = 0 : i32, sym_name = "out_cons_lock"}
+    // Second input stream (in2): two buffers and a lock pair on tile (0,2), plus a
+    // lock pair on tile (0,0).
+    %in2_cons_buff_0 = aie.buffer(%tile_0_2) {address = 1152 : i32, sym_name = "in2_cons_buff_0"} : memref<16xi32> 
+    %in2_cons_buff_1 = aie.buffer(%tile_0_2) {address = 1216 : i32, sym_name = "in2_cons_buff_1"} : memref<16xi32> 
+    %in2_cons_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "in2_cons_prod_lock"}
+    %in2_cons_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "in2_cons_cons_lock"}
+    %in2_prod_lock = aie.lock(%tile_0_0, 2) {init = 1 : i32, sym_name = "in2_prod_lock"}
+    %in2_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "in2_cons_lock"}
+    // First input stream (in1): same layout as in2.
+    %in1_cons_buff_0 = aie.buffer(%tile_0_2) {address = 1280 : i32, sym_name = "in1_cons_buff_0"} : memref<16xi32> 
+    %in1_cons_buff_1 = aie.buffer(%tile_0_2) {address = 1344 : i32, sym_name = "in1_cons_buff_1"} : memref<16xi32> 
+    %in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "in1_cons_prod_lock"}
+    %in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "in1_cons_cons_lock"}
+    %in1_prod_lock = aie.lock(%tile_0_0, 0) {init = 1 : i32, sym_name = "in1_prod_lock"}
+    %in1_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "in1_cons_lock"}
+
+
+    // Packet-switched data routes between the DMAs of tile (0,0) and tile (0,2),
+    // selected by packet ID (1, 2, 3).
+    // NOTE(review): flows 1 and 3 have the same source and destination
+    // (tile (0,0) DMA:0 -> tile (0,2) DMA:0), while @out is allocated as an S2MM
+    // channel; presumably flow 3 was meant to run from tile (0,2) back to
+    // tile (0,0) - confirm.
+    aie.packet_flow(1) {
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_2, DMA : 0>
+    }
+    aie.packet_flow(2) {
+      aie.packet_source<%tile_0_0, DMA : 1>
+      aie.packet_dest<%tile_0_2, DMA : 1>
+    }
+    aie.packet_flow(3) {
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_2, DMA : 0>
+    }
+
+    // Core program for tile (0,2): element-wise i32 multiply of in1 and in2 into
+    // out, alternating between the _0 and _1 buffers of each stream.
+    // Loop structure (already lowered to cf branches):
+    //   ^bb1: outer loop, 0 .. 9223372036854775807 step 1
+    //   ^bb3: inner loop, 0 .. 256 step 2 (each iteration handles both buffers)
+    //   ^bb5 / ^bb8: 16-element multiply loops over buffer 0 / buffer 1
+    %core_0_2 = aie.core(%tile_0_2) {
+      %c0 = arith.constant 0 : index
+      %c9223372036854775807 = arith.constant 9223372036854775807 : index
+      %c1 = arith.constant 1 : index
+      cf.br ^bb1(%c0 : index)
+    ^bb1(%0: index):  // 2 preds: ^bb0, ^bb11
+      %1 = arith.cmpi slt, %0, %c9223372036854775807 : index
+      cf.cond_br %1, ^bb2, ^bb12
+    ^bb2:  // pred: ^bb1
+      %c0_0 = arith.constant 0 : index
+      %c256 = arith.constant 256 : index
+      %c1_1 = arith.constant 1 : index
+      %c2 = arith.constant 2 : index
+      cf.br ^bb3(%c0_0 : index)
+    ^bb3(%2: index):  // 2 preds: ^bb2, ^bb10
+      %3 = arith.cmpi slt, %2, %c256 : index
+      cf.cond_br %3, ^bb4, ^bb11
+    ^bb4:  // pred: ^bb3
+      // Wait for one filled in1 buffer, one filled in2 buffer and one free out
+      // buffer before touching the _0 buffers.
+      aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1)
+      aie.use_lock(%in2_cons_cons_lock, AcquireGreaterEqual, 1)
+      aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1)
+      %c0_2 = arith.constant 0 : index
+      %c16 = arith.constant 16 : index
+      %c1_3 = arith.constant 1 : index
+      cf.br ^bb5(%c0_2 : index)
+    ^bb5(%4: index):  // 2 preds: ^bb4, ^bb6
+      %5 = arith.cmpi slt, %4, %c16 : index
+      cf.cond_br %5, ^bb6, ^bb7
+    ^bb6:  // pred: ^bb5
+      // out_buff_0[i] = in1_cons_buff_0[i] * in2_cons_buff_0[i]
+      %6 = memref.load %in1_cons_buff_0[%4] : memref<16xi32>
+      %7 = memref.load %in2_cons_buff_0[%4] : memref<16xi32>
+      %8 = arith.muli %6, %7 : i32
+      memref.store %8, %out_buff_0[%4] : memref<16xi32>
+      %9 = arith.addi %4, %c1_3 : index
+      cf.br ^bb5(%9 : index)
+    ^bb7:  // pred: ^bb5
+      // Hand the _0 buffers back, then acquire the same three locks again for the
+      // _1 buffers.
+      aie.use_lock(%in1_cons_prod_lock, Release, 1)
+      aie.use_lock(%in2_cons_prod_lock, Release, 1)
+      aie.use_lock(%out_cons_lock, Release, 1)
+      aie.use_lock(%in1_cons_cons_lock, AcquireGreaterEqual, 1)
+      aie.use_lock(%in2_cons_cons_lock, AcquireGreaterEqual, 1)
+      aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1)
+      %c0_4 = arith.constant 0 : index
+      %c16_5 = arith.constant 16 : index
+      %c1_6 = arith.constant 1 : index
+      cf.br ^bb8(%c0_4 : index)
+    ^bb8(%10: index):  // 2 preds: ^bb7, ^bb9
+      %11 = arith.cmpi slt, %10, %c16_5 : index
+      cf.cond_br %11, ^bb9, ^bb10
+    ^bb9:  // pred: ^bb8
+      // out_buff_1[i] = in1_cons_buff_1[i] * in2_cons_buff_1[i]
+      %12 = memref.load %in1_cons_buff_1[%10] : memref<16xi32>
+      %13 = memref.load %in2_cons_buff_1[%10] : memref<16xi32>
+      %14 = arith.muli %12, %13 : i32
+      memref.store %14, %out_buff_1[%10] : memref<16xi32>
+      %15 = arith.addi %10, %c1_6 : index
+      cf.br ^bb8(%15 : index)
+    ^bb10:  // pred: ^bb8
+      aie.use_lock(%in1_cons_prod_lock, Release, 1)
+      aie.use_lock(%in2_cons_prod_lock, Release, 1)
+      aie.use_lock(%out_cons_lock, Release, 1)
+      // Advance the inner induction variable by 2: two buffers were processed.
+      %16 = arith.addi %2, %c2 : index
+      cf.br ^bb3(%16 : index)
+    ^bb11:  // pred: ^bb3
+      %17 = arith.addi %0, %c1 : index
+      cf.br ^bb1(%17 : index)
+    ^bb12:  // pred: ^bb1
+      aie.end
+    }
+    // Shim DMA channel for @in1: MM2S, channel 0, column 0.
+    aie.shim_dma_allocation @in1(MM2S, 0, 0)
+    // Runtime sequence over three 4096 x i32 buffers. It issues one 4096-element
+    // transfer per buffer (%arg2 with @out / packet ID 3, %arg0 with @in1 /
+    // packet ID 1, %arg1 with @in2 / packet ID 2) and then syncs on channel 0,
+    // direction 0 of column 0, row 0.
+    aiex.runtime_sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) {
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 3>) {id = 2 : i64, metadata = @out} : memref<4096xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 1>) {id = 0 : i64, metadata = @in1} : memref<4096xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0, 1], packet = <pkt_type = 0, pkt_id = 2>) {id = 1 : i64, metadata = @in2} : memref<4096xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+    }
+    // Shim DMA channels for @in2 (MM2S, channel 1) and @out (S2MM, channel 0).
+    aie.shim_dma_allocation @in2(MM2S, 1, 0)
+    aie.shim_dma_allocation @out(S2MM, 0, 0)
+    // DMA program for tile (0,2). Three channels are started, each with two
+    // buffer descriptors that chain to one another (bd 0<->1, 2<->3, 4<->5):
+    //   S2MM 0 -> in1_cons_buff_0 / in1_cons_buff_1
+    //   S2MM 1 -> in2_cons_buff_0 / in2_cons_buff_1
+    //   MM2S 0 <- out_buff_0 / out_buff_1
+    // Every descriptor acquires one lock before the 16-element transfer and
+    // releases the paired lock after it.
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb2
+      aie.use_lock(%in1_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%in1_cons_buff_0 : memref<16xi32>, 0, 16) {bd_id = 0 : i32, next_bd_id = 1 : i32}
+      aie.use_lock(%in1_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb2
+    ^bb2:  // pred: ^bb1
+      aie.use_lock(%in1_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%in1_cons_buff_1 : memref<16xi32>, 0, 16) {bd_id = 1 : i32, next_bd_id = 0 : i32}
+      aie.use_lock(%in1_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb1
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 1, ^bb4, ^bb6)
+    ^bb4:  // 2 preds: ^bb3, ^bb5
+      aie.use_lock(%in2_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%in2_cons_buff_0 : memref<16xi32>, 0, 16) {bd_id = 2 : i32, next_bd_id = 3 : i32}
+      aie.use_lock(%in2_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb5
+    ^bb5:  // pred: ^bb4
+      aie.use_lock(%in2_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%in2_cons_buff_1 : memref<16xi32>, 0, 16) {bd_id = 3 : i32, next_bd_id = 2 : i32}
+      aie.use_lock(%in2_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb4
+    ^bb6:  // pred: ^bb3
+      %2 = aie.dma_start(MM2S, 0, ^bb7, ^bb9)
+    ^bb7:  // 2 preds: ^bb6, ^bb8
+      aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%out_buff_0 : memref<16xi32>, 0, 16) {bd_id = 4 : i32, next_bd_id = 5 : i32}
+      aie.use_lock(%out_prod_lock, Release, 1)
+      aie.next_bd ^bb8
+    ^bb8:  // pred: ^bb7
+      aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%out_buff_1 : memref<16xi32>, 0, 16) {bd_id = 5 : i32, next_bd_id = 4 : i32}
+      aie.use_lock(%out_prod_lock, Release, 1)
+      aie.next_bd ^bb7
+    ^bb9:  // pred: ^bb6
+      aie.end
+    }
+    // Control-packet routes; all keep the packet header and are marked as
+    // priority routes.
+    //   ID 15: tile (0,0) Ctrl:0 -> tile (0,0) South:0
+    //   ID 15: tile (0,0) DMA:0  -> tile (0,0) Ctrl:0
+    //   ID 27: tile (0,0) DMA:0  -> tile (0,2) Ctrl:0
+    // IDs 15 and 27 equal the controller_id packet IDs of tile (0,0) and
+    // tile (0,2) respectively.
+    aie.packet_flow(15) {
+      aie.packet_source<%tile_0_0, Ctrl : 0>
+      aie.packet_dest<%tile_0_0, South : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    aie.packet_flow(15) {
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_0, Ctrl : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    aie.packet_flow(27) {
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_2, Ctrl : 0>
+    } {keep_pkt_header = true, priority_route = true}
+  }
+}
diff --git a/programming_examples/basic/vvmul-merged-ctrl/run.lit b/programming_examples/basic/vvmul-merged-ctrl/run.lit
new file mode 100644
index 0000000000..bd0094f177
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-ctrl/run.lit
@@ -0,0 +1,14 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// Step 1 - compile aie1.mlir with CDO generation into aie1.xclbin / insts1.txt.
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --basic-alloc-scheme --generate-ctrl-pkt-overlay --xclbin-name=aie1.xclbin --npu-insts-name=insts1.txt %S/aie1.mlir
+// Step 2 - compile aie2.mlir with control-packet generation; the next two steps
+// read aie2.mlir.prj/ctrlpkt.mlir, and insts2.txt is written here.
+// RUN: %python aiecc.py --no-aiesim --aie-generate-ctrlpkt --aie-generate-npu --no-compile-host --basic-alloc-scheme --generate-ctrl-pkt-overlay --npu-insts-name=insts2.txt %S/aie2.mlir
+// Step 3 - translate the control packets into ctrlpkt.txt.
+// RUN: aie-translate -aie-ctrlpkt-to-bin aie2.mlir.prj/ctrlpkt.mlir -o ctrlpkt.txt
+// Step 4 - rewrite the control packets as DMA operations, saved as aie3.mlir.
+// RUN: aie-opt -aie-ctrl-packet-infer-tiles -aie-generate-column-control-overlay="route-shim-to-tile-ctrl=true" -aie-ctrl-packet-to-dma aie2.mlir.prj/ctrlpkt.mlir > aie3.mlir
+// Step 5 - generate insts3.txt from aie3.mlir.
+// RUN: %python aiecc.py --no-aiesim --aie-only-generate-npu --no-compile-host --generate-ctrl-pkt-overlay --xclbin-name=aie3.xclbin --npu-insts-name=insts3.txt aie3.mlir
+
+// Build the host program, run it on the NPU and expect it to report success.
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe | FileCheck %s
+// CHECK: PASS!
diff --git a/programming_examples/basic/vvmul-merged-ctrl/run.sh b/programming_examples/basic/vvmul-merged-ctrl/run.sh
new file mode 100644
index 0000000000..1b4864bee2
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-ctrl/run.sh
@@ -0,0 +1,7 @@
+python /scratch/aba/micro/mlir-aie/install/bin/aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --basic-alloc-scheme --generate-ctrl-pkt-overlay --xclbin-name=aie1.xclbin --npu-insts-name=insts1.txt ./aie1.mlir
+python /scratch/aba/micro/mlir-aie/install/bin/aiecc.py --no-aiesim --aie-generate-ctrlpkt --aie-generate-npu --no-compile-host --basic-alloc-scheme --generate-ctrl-pkt-overlay --npu-insts-name=insts2.txt ./aie2.mlir
+aie-translate -aie-ctrlpkt-to-bin aie2.mlir.prj/ctrlpkt.mlir -o ctrlpkt.txt
+aie-opt -aie-ctrl-packet-infer-tiles -aie-generate-column-control-overlay="route-shim-to-tile-ctrl=true" -aie-ctrl-packet-to-dma aie2.mlir.prj/ctrlpkt.mlir > aie3.mlir
+python /scratch/aba/micro/mlir-aie/install/bin/aiecc.py --no-aiesim --aie-only-generate-npu --no-compile-host --generate-ctrl-pkt-overlay --xclbin-name=aie3.xclbin --npu-insts-name=insts3.txt aie3.mlir
+clang ./test.cpp -o test.exe -std=c++11 -Wall -I/opt/xilinx/xrt/include -L/opt/xilinx/xrt/lib -luuid -lxrt_coreutil -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+./test.exe
\ No newline at end of file
diff --git a/programming_examples/basic/vvmul-merged-ctrl/test.cpp b/programming_examples/basic/vvmul-merged-ctrl/test.cpp
new file mode 100644
index 0000000000..d6e47ab306
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-ctrl/test.cpp
@@ -0,0 +1,183 @@
+//===- test.cpp -------------------------------------------000---*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "experimental/xrt_kernel.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 4096;
+constexpr int OUT_SIZE = IN_SIZE;
+constexpr int CTRL_IN_SIZE = 1024;
+
+#define IN_DATATYPE std::uint32_t
+#define OUT_DATATYPE std::uint32_t
+
+// Reads a text file that holds one hexadecimal 32-bit word per line.
+//
+// instr_path: path of the file to read.
+// Returns the parsed words in file order.
+// Throws std::runtime_error when the file cannot be opened or when a line does
+// not start with a hexadecimal number.
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  // Without this check a missing file would silently yield an empty sequence.
+  if (!instr_file.is_open()) {
+    throw std::runtime_error("Unable to open instruction file " + instr_path +
+                             "\n");
+  }
+  std::vector<uint32_t> instr_v;
+  std::string line;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t word;
+    if (!(iss >> std::hex >> word)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(word);
+  }
+  return instr_v;
+}
+
+// Host driver for the control-packet reconfiguration flow.
+//
+// Loads aie1.xclbin, then queues two runs of the same kernel on one runlist:
+// run 0 executes insts3.txt with ctrlpkt.txt as its data buffer
+// (configuration), run 1 executes insts2.txt with the input and output
+// vectors (the design). Afterwards the output buffer is read back and checked.
+//
+// Returns 0 and prints "PASS!" when every output element matches the
+// reference, 1 otherwise.
+int main(int argc, const char *argv[]) {
+  // AIE design's data streams
+  std::vector<uint32_t> instr2_v = load_instr_sequence("insts2.txt");
+  // AIE configuration as control packet streams
+  std::vector<uint32_t> instr3_cfg_v = load_instr_sequence("insts3.txt");
+  // AIE configuration control packets' raw data
+  std::vector<uint32_t> ctrlPackets = load_instr_sequence("ctrlpkt.txt");
+
+  // The control packet buffer object below has a fixed capacity of
+  // CTRL_IN_SIZE words; refuse to copy more than that into it.
+  if (ctrlPackets.size() > static_cast<std::size_t>(CTRL_IN_SIZE)) {
+    std::cerr << "ctrlpkt.txt holds " << ctrlPackets.size()
+              << " words but the control packet buffer only fits "
+              << CTRL_IN_SIZE << std::endl;
+    return 1;
+  }
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  // Skeleton xclbin containing only the control packet network
+  auto xclbin = xrt::xclbin("aie1.xclbin");
+
+  // The kernel name is hard-coded instead of being looked up in the xclbin.
+  auto kernelName = "MLIR_AIE";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr3 = xrt::bo(device, instr3_cfg_v.size() * sizeof(uint32_t),
+                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_instr2 = xrt::bo(device, instr2_v.size() * sizeof(uint32_t),
+                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(IN_DATATYPE),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(IN_DATATYPE),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(OUT_DATATYPE),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+  auto bo_ctrlpkt = xrt::bo(device, CTRL_IN_SIZE * sizeof(int32_t),
+                            XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+
+  // Both inputs are filled with ones.
+  IN_DATATYPE *bufInA = bo_inA.map<IN_DATATYPE *>();
+  IN_DATATYPE *bufInB = bo_inB.map<IN_DATATYPE *>();
+  std::vector<IN_DATATYPE> srcVecA(IN_SIZE, 1);
+  std::vector<IN_DATATYPE> srcVecB(IN_SIZE, 1);
+  memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(IN_DATATYPE)));
+  memcpy(bufInB, srcVecB.data(), (srcVecB.size() * sizeof(IN_DATATYPE)));
+
+  void *bufInstr2 = bo_instr2.map<void *>();
+  memcpy(bufInstr2, instr2_v.data(), instr2_v.size() * sizeof(uint32_t));
+
+  void *bufInstr3 = bo_instr3.map<void *>();
+  memcpy(bufInstr3, instr3_cfg_v.data(),
+         instr3_cfg_v.size() * sizeof(uint32_t));
+
+  void *bufctrlpkt = bo_ctrlpkt.map<void *>();
+  memcpy(bufctrlpkt, ctrlPackets.data(),
+         ctrlPackets.size() * sizeof(uint32_t));
+
+  // Synchronizing BOs
+  bo_instr3.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_instr2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_ctrlpkt.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  unsigned int opcode = 3;
+
+  // Creating a runlist to contain two separate runs
+  xrt::runlist runlist = xrt::runlist(context);
+
+  // Run 0: configuration
+  auto run0 = xrt::run(kernel);
+  run0.set_arg(0, opcode);
+  run0.set_arg(1, bo_instr3);
+  run0.set_arg(2, instr3_cfg_v.size());
+  run0.set_arg(3, bo_ctrlpkt);
+  run0.set_arg(4, 0);
+  run0.set_arg(5, 0);
+  run0.set_arg(6, 0);
+  run0.set_arg(7, 0);
+  // Run 1: the design
+  auto run1 = xrt::run(kernel);
+  run1.set_arg(0, opcode);
+  run1.set_arg(1, bo_instr2);
+  run1.set_arg(2, instr2_v.size());
+  run1.set_arg(3, bo_inA);
+  run1.set_arg(4, bo_inB);
+  run1.set_arg(5, bo_out);
+  run1.set_arg(6, 0);
+  run1.set_arg(7, 0);
+
+  // Executing and waiting on the runlist
+  runlist.add(run0);
+  runlist.add(run1);
+  runlist.execute();
+  runlist.wait();
+
+  // Read the result back and verify it, so the program reports PASS/failed.
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  OUT_DATATYPE *bufOut = bo_out.map<OUT_DATATYPE *>();
+
+  // NOTE(review): the reference assumes the design in aie2.mlir multiplies the
+  // two inputs element-wise, as the example's name suggests - confirm.
+  constexpr int MAX_REPORTED_ERRORS = 10;
+  int errors = 0;
+  for (int i = 0; i < OUT_SIZE; i++) {
+    OUT_DATATYPE ref = srcVecA[i] * srcVecB[i];
+    if (bufOut[i] != ref) {
+      // Only the first few mismatches are printed to keep the log readable.
+      if (errors < MAX_REPORTED_ERRORS) {
+        std::cout << "Error in output[" << i << "]: " << bufOut[i]
+                  << " != " << ref << std::endl;
+      }
+      errors++;
+    }
+  }
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  }
+
+  std::cout << "\nfailed.\n\n";
+  return 1;
+}
diff --git a/programming_examples/basic/vvmul-merged-ctrl/test.exe b/programming_examples/basic/vvmul-merged-ctrl/test.exe
new file mode 100755
index 0000000000..f75430a527
Binary files /dev/null and b/programming_examples/basic/vvmul-merged-ctrl/test.exe differ
diff --git a/programming_examples/basic/vvmul-merged-xclbin/CMakeLists.txt b/programming_examples/basic/vvmul-merged-xclbin/CMakeLists.txt
new file mode 100644
index 0000000000..68cfdbd97f
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-xclbin/CMakeLists.txt
@@ -0,0 +1,77 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# CMAKE_CXX_STANDARD 23 is only understood by CMake >= 3.20 and
+# target_link_directories() needs >= 3.13; CMake 4.x additionally rejects
+# minimum versions below 3.5. Declare the version this file really requires.
+cmake_minimum_required(VERSION 3.20)
+
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+# WSL is set when powershell.exe can be found; the branches below use it to
+# pick between the default paths for the two host setups.
+find_program(WSL NAMES powershell.exe)
+
+# Default toolchain and dependency locations. The compilers are set before
+# project() is called. All paths are cache entries and can be overridden with -D.
+if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+# Name of the executable; also used as the project name.
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+# When powershell.exe was found, place Release binaries directly in the build
+# directory.
+if ( WSL )
+	set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+# The host executable is built from test.cpp plus the shared test_utils.cpp
+# from runtime_lib/test_lib.
+add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
+    test.cpp
+)
+
+# DATASIZE is forwarded from the configure command line (-DDATASIZE=...); if it
+# is not given, the macro is defined with an empty value.
+target_compile_definitions(${currentTarget} PUBLIC 
+    DATASIZE=${DATASIZE}
+    DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC 
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+# The Boost libraries are linked by name only in the non-WSL branch.
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
diff --git a/programming_examples/basic/vvmul-merged-xclbin/Makefile b/programming_examples/basic/vvmul-merged-xclbin/Makefile
new file mode 100755
index 0000000000..d6f5e4174c
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-xclbin/Makefile
@@ -0,0 +1,84 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# 
+##===----------------------------------------------------------------------===##
+
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+include ${srcdir}/../../makefile-common
+
+data_size = 8192
+targetname = vectorMult
+devicename = npu
+col = 0
+
+VPATH := ${srcdir}/../../../aie_kernels/aie2
+
+build2/%.o: %.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
+
+
+all: build/final.xclbin
+
+build/aie.mlir: ${srcdir}/aie2.py
+	mkdir -p ${@D}
+	python3 $< ${devicename} ${col} ${data_size} > $@
+
+build/final.xclbin: build/aie.mlir build2/final.xclbin
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --xclbin-kernel-name=VVM --xclbin-kernel-id=0x902 \
+				--xclbin-instance-name=VVMINST --no-aiesim  --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--xclbin-input=../build2/final.xclbin  \
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
+# Anchor the generator on srcdir so `make -f <dir>/Makefile` works from any cwd.
+build2/aie.mlir: ${srcdir}/../vector_scalar_mul/aie2.py
+	mkdir -p ${@D}
+	python3 $< ${data_size} 0 > $@
+
+build2/final.xclbin: build2/aie.mlir  build2/scale.o
+	cd build2/ && aiecc.py --xclbin-kernel-name=VSM --xclbin-kernel-id=0x901 \
+			--xclbin-instance-name=VSMINST --no-aiesim  --aie-generate-cdo --no-compile-host --xclbin-name=final.xclbin \
+			--aie-generate-npu --npu-insts-name=insts.txt aie.mlir
+
+
+${targetname}.exe: ${srcdir}/test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname} -DDATASIZE=${data_size}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@ 
+endif
+
+
+
+# Changing variables when we target VCK5000
+vck5000: devicename=xcvc1902
+vck5000: col=6
+
+vck5000: build/aie.mlir
+	aiecc.py --xchesscc --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \
+		-I/opt/xaiengine/include \
+		-I${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/include \
+		-L/opt/xaiengine/lib \
+		-L/lib/x86_64-linux-gnu/ \
+		${srcdir}/test_vck5000.cpp \
+		${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
+		-Wl,-R/opt/xaiengine/lib \
+		-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
+
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt 
+	${powershell} ./$< --xclbin build/final.xclbin \
+	--instr build/insts.txt --instr_2 build2/insts.txt -k VVM
+
+clean:
+	rm -rf build build2 _build inst aie.mlir.prj core_* test.elf ${targetname}.exe build2/final.xclbin ../vector_scalar_mul/aie.mlir
diff --git a/programming_examples/basic/vvmul-merged-xclbin/README.md b/programming_examples/basic/vvmul-merged-xclbin/README.md
new file mode 100644
index 0000000000..56b0a6cde0
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-xclbin/README.md
@@ -0,0 +1,57 @@
+<!---//===- README.md --------------------------*- Markdown -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+// 
+//===----------------------------------------------------------------------===//-->
+
+# <ins>Vector Vector Multiply</ins>
+
+A simple binary operator, which uses a single AIE core to multiply two vectors together.  The overall vector size is set by `data_size` in the Makefile (default `8192`; the VCK5000 testbench allocates `256`-element buffers) and is processed by the core in smaller sub tiles of size `16`.  It shows how simple it can be to just feed data into the AIEs using the ObjectFIFO abstraction, and drain the results back to external memory.  This reference design can be run on either a Ryzen™ AI NPU or a VCK5000.
+
+The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targeting NPU or VCK5000. The AIE tile performs the multiplication operations and the Shim tile brings the data back out to external memory.
+
+## Source Files Overview
+
+1. `aie2.py`: defines the AIE array structural design using IRON AIE language bindings. This generates mlir-aie that is then compiled using `aiecc.py` to produce design binaries (i.e. XCLBIN and insts.txt for the NPU in Ryzen™ AI). 
+
+1. `test.cpp`: This C++ code is a testbench for the design example targeting Ryzen™ AI (AIE-ML). The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the program verifies the results.
+
+1. `test_vck5000.cpp`: This C++ code is a testbench for the design example targeting the VCK5000 PCIe card (AIE). The code is responsible for configuring the AIEs, allocating memory, providing input data, and executing the AIE design on the VCK5000. After executing, the program verifies the results.
+
+## Ryzen™ AI Usage
+
+### C++ Testbench
+
+To compile the design and C++ testbench:
+
+```
+make
+make vectorMult.exe
+```
+
+To run the design:
+
+```
+make run
+```
+
+## VCK5000 Usage
+
+### C++ Testbench
+
+To compile the design and C++ testbench:
+
+```
+make vck5000
+```
+
+To run the design:
+
+```
+./test.elf
+```
+
diff --git a/programming_examples/basic/vvmul-merged-xclbin/aie2.py b/programming_examples/basic/vvmul-merged-xclbin/aie2.py
new file mode 100644
index 0000000000..2400c6c36a
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-xclbin/aie2.py
@@ -0,0 +1,94 @@
+# vector_vector_mul/aie2.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+from aie.extras.dialects.ext import memref, arith
+
+import sys
+
+
+def my_vector_mul(vector_size):
+    N = vector_size
+    n = 16
+    N_div_n = N // n
+
+    buffer_depth = 2
+
+    if len(sys.argv) != 4:
+        raise ValueError("[ERROR] Need 3 command line arguments (Device name, Col, size)")
+
+    if sys.argv[1] == "npu":
+        dev = AIEDevice.npu1_1col
+    elif sys.argv[1] == "xcvc1902":
+        dev = AIEDevice.xcvc1902
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+
+    @device(dev)
+    def device_body():
+        memRef_ty = T.memref(n, T.i32())
+
+        # AIE Core Function declarations
+
+        # Tile declarations
+        ShimTile = tile(int(sys.argv[2]), 0)
+        ComputeTile2 = tile(int(sys.argv[2]), 2)
+
+        # AIE-array data movement with object fifos
+        of_in1 = object_fifo("in1", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
+        of_in2 = object_fifo("in2", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
+        of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty)
+
+        # Set up compute tiles
+
+        # Compute tile 2
+        @core(ComputeTile2)
+        def core_body():
+            # Effective while(1)
+            for _ in for_(sys.maxsize):
+                # Number of sub-vector "tile" iterations
+                for _ in for_(N_div_n):
+                    elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1)
+                    elem_in2 = of_in2.acquire(ObjectFifoPort.Consume, 1)
+                    elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    for i in for_(n):
+                        v0 = memref.load(elem_in1, [i])
+                        v1 = memref.load(elem_in2, [i])
+                        v2 = arith.muli(v0, v1)
+                        memref.store(v2, elem_out, [i])
+                        yield_([])
+                    of_in1.release(ObjectFifoPort.Consume, 1)
+                    of_in2.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
+                    yield_([])
+                yield_([])
+
+        # To/from AIE-array data movement
+        tensor_ty = T.memref(N, T.i32())
+
+        @runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
+        def sequence(A, B, C):
+            npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
+            npu_sync(column=0, row=0, direction=0, channel=0)
+
+
+with mlir_mod_ctx() as ctx:
+    vector_size = int(sys.argv[3])
+    my_vector_mul(vector_size)
+    res = ctx.module.operation.verify()
+    if res == True:
+        print(ctx.module)
+    else:
+        print(res)
diff --git a/programming_examples/basic/vvmul-merged-xclbin/run_makefile.lit b/programming_examples/basic/vvmul-merged-xclbin/run_makefile.lit
new file mode 100644
index 0000000000..6875524001
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-xclbin/run_makefile.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile 
+// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
+// CHECK: PASS!
diff --git a/programming_examples/basic/vvmul-merged-xclbin/run_vck5000.lit b/programming_examples/basic/vvmul-merged-xclbin/run_vck5000.lit
new file mode 100644
index 0000000000..dcaa9f99c5
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-xclbin/run_vck5000.lit
@@ -0,0 +1,8 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: hsa, chess
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile vck5000
+// RUN: %run_on_vck5000 ./test.elf
diff --git a/programming_examples/basic/vvmul-merged-xclbin/sweep.sh b/programming_examples/basic/vvmul-merged-xclbin/sweep.sh
new file mode 100644
index 0000000000..579a545205
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-xclbin/sweep.sh
@@ -0,0 +1,9 @@
+M_lo=4096
+M_step=4096
+M_hi=12288
+
+Ms=$(seq $M_lo $M_step $M_hi)
+
+for M in $Ms; do
+    make clean && make run data_size=$M
+done
\ No newline at end of file
diff --git a/programming_examples/basic/vvmul-merged-xclbin/test.cpp b/programming_examples/basic/vvmul-merged-xclbin/test.cpp
new file mode 100644
index 0000000000..40c2d64f8e
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-xclbin/test.cpp
@@ -0,0 +1,190 @@
+
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+namespace po = boost::program_options;
+using TY_2 = std::uint32_t;
+const int scaleFactor = 3;
+
+int main(int argc, const char *argv[]) {
+  // Run the VVM and VSM kernels of the merged xclbin, time them, verify both.
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
+  po::options_description desc("Allowed options");
+  po::variables_map vm;
+  desc.add_options()("help,h", "produce help message")(
+    "xclbin", po::value<std::string>()->required(),
+    "the input xclbin path2")(
+    "kernel,k", po::value<std::string>()->required(),
+    "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+    "verbosity,v", po::value<int>()->default_value(0),
+    "the verbosity of the output")(
+    "instr", po::value<std::string>()->required(),
+    "path of file containing userspace instructions sent to the NPU")(
+    "instr_2", po::value<std::string>()->required(),
+    "path of file containing userspace instructions sent to the NPU")(
+    "verify", po::value<bool>()->default_value(true),
+    "whether to verify the AIE computed output")(
+    "iters", po::value<int>()->default_value(1))(
+    "warmup", po::value<int>()->default_value(0))(
+    "trace_sz,t", po::value<int>()->default_value(0))(
+    "trace_file", po::value<std::string>()->default_value("trace.txt"),
+    "where to store trace output");
+
+  test_utils::parse_options(argc, argv, desc, vm);
+  int verbosity = vm["verbosity"].as<int>();
+  int do_verify = vm["verify"].as<bool>();
+  int n_iterations = vm["iters"].as<int>();
+  int n_warmup_iterations = vm["warmup"].as<int>();
+  int trace_size = vm["trace_sz"].as<int>();
+
+  constexpr int IN_SIZE_1 = DATASIZE;
+  constexpr int IN_VOLUME_2 = DATASIZE;
+
+  constexpr int OUT_SIZE_1 = DATASIZE;
+  constexpr int OUT_VOLUME_2 = DATASIZE;
+  int IN_SIZE_2 = IN_VOLUME_2 * sizeof(TY_2);
+  int OUT_SIZE_2 = OUT_VOLUME_2 * sizeof(TY_2) + trace_size;
+
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+  std::vector<uint32_t> instr_v1 = test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>()); // Load the xclbin
+  device.register_xclbin(xclbin); // Register xclbin
+  xrt::hw_context context_1(device, xclbin.get_uuid()); // Get a hardware context
+  auto kernel_1 = xrt::kernel(context_1, "VVM");
+
+  std::vector<uint32_t> instr_v2 = test_utils::load_instr_sequence(vm["instr_2"].as<std::string>());
+  auto kernel_2 = xrt::kernel(context_1, "VSM"); // Get a kernel_2 handle: MLIR_AIE
+
+
+  // ------------------------------------------------------
+  // Initialize input/ output buffer sizes and sync them
+  // ------------------------------------------------------
+
+  auto bo_instr_1 = xrt::bo(device, instr_v1.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel_1.group_id(1));
+  void *bufInstr_1 = bo_instr_1.map<void *>();
+  memcpy(bufInstr_1, instr_v1.data(), instr_v1.size() * sizeof(int));
+
+  auto bo_inA_1   = xrt::bo(device, IN_SIZE_1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_1.group_id(3));
+  uint32_t *bufInA_1 = bo_inA_1.map<uint32_t *>();
+  std::vector<uint32_t> srcVecA_1;
+  for (int i = 0; i < IN_SIZE_1; i++)
+    srcVecA_1.push_back(i + 1);
+  memcpy(bufInA_1, srcVecA_1.data(), (srcVecA_1.size() * sizeof(uint32_t)));
+
+  auto bo_inB_1   = xrt::bo(device, IN_SIZE_1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_1.group_id(4));
+  uint32_t *bufInB_1 = bo_inB_1.map<uint32_t *>();
+  std::vector<uint32_t> srcVecB_1;
+  for (int i = 0; i < IN_SIZE_1; i++)
+    srcVecB_1.push_back(i);
+  memcpy(bufInB_1, srcVecB_1.data(), (srcVecB_1.size() * sizeof(uint32_t)));
+
+  auto bo_out_1   = xrt::bo(device, OUT_SIZE_1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_1.group_id(5));
+
+
+  auto bo_instr_2  = xrt::bo(device, instr_v2.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel_2.group_id(1));
+  void *bufInstr_2 = bo_instr_2.map<void *>();
+  memcpy(bufInstr_2, instr_v2.data(), instr_v2.size() * sizeof(int));
+
+  auto bo_inA_2  = xrt::bo(device, IN_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel_2.group_id(3));
+  TY_2 *bufInA_2 = bo_inA_2.map<TY_2 *>();
+  for (int i = 0; i < IN_VOLUME_2; i++)
+    bufInA_2[i] = i + 1;
+
+  auto bo_inFactor = xrt::bo(device, 1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_2.group_id(4));
+  int32_t *bufInFactor = bo_inFactor.map<int32_t *>();
+  *bufInFactor = (TY_2)scaleFactor;
+
+  auto bo_outC_2 = xrt::bo(device, OUT_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel_2.group_id(5));
+
+
+  bo_instr_1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA_1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inB_1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  bo_instr_2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA_2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inFactor.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // RUN KERNEL
+  // Number of elements to verify; follows DATASIZE so no output is skipped.
+  constexpr uint32_t SIZE = DATASIZE;
+
+  unsigned int opcode = 3;
+
+  std::ofstream f_time;
+  std::string file_name = "time_" + std::to_string(DATASIZE) + ".txt";
+  f_time.open(file_name);
+  for (int i=1; i<=1000; i++) {
+    auto start = std::chrono::high_resolution_clock::now();
+
+    auto run1 = kernel_1(opcode, bo_instr_1, instr_v1.size(), bo_inA_1, bo_inB_1, bo_out_1);
+    run1.wait();
+    auto run2 = kernel_2(opcode, bo_instr_2, instr_v2.size(), bo_inA_2, bo_inFactor, bo_outC_2);
+    run2.wait();
+
+    auto stop = std::chrono::high_resolution_clock::now();
+    float npu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
+    if (i<11)
+      std::cout << i << " " << IN_SIZE_1 << " NPU time: " << npu_time << "us." << std::endl;
+    f_time << npu_time << "\n";
+  }
+  f_time.close();
+
+  bo_out_1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  bo_outC_2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  uint32_t *bufOut_1 = bo_out_1.map<uint32_t *>();
+  TY_2 *bufOut_2 = bo_outC_2.map<TY_2 *>();
+
+
+  // COMPARE (both loops are skipped when --verify is false)
+  int errors = 0;
+
+  for (uint32_t i = 0; do_verify && i < SIZE; i++) {
+    if (*(bufOut_1 + i) != *(bufInA_1 + i) * *(bufInB_1 + i)) {
+      std::cout << "Error in output " << *(bufOut_1 + i)
+                << " != " << *(bufInA_1 + i) << " * " << *(bufInB_1 + i)
+                << std::endl;
+      errors++;
+    } else {
+      if (verbosity > 1)
+        std::cout << "Correct output " << *(bufOut_1 + i)
+                  << " == " << *(bufInA_1 + i) * *(bufInB_1 + i) << std::endl;
+    }
+  }
+
+  for (uint32_t i = 0; do_verify && i < SIZE; i++) {
+    int32_t ref = bufInA_2[i] * scaleFactor;
+    int32_t test = bufOut_2[i];
+    if (test != ref) {
+      if (verbosity >= 1)
+        std::cout << "Error in output " << test << " != " << ref << std::endl;
+      errors++;
+    } else {
+      if (verbosity >= 1)
+        std::cout << "Correct output " << test << " == " << ref << std::endl;
+    }
+  }
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nfailed with errors:" << errors << ".\n\n";
+    return 1;
+  }
+  return 0;
+}
diff --git a/programming_examples/basic/vvmul-merged-xclbin/test_vck5000.cpp b/programming_examples/basic/vvmul-merged-xclbin/test_vck5000.cpp
new file mode 100644
index 0000000000..ab2096bef9
--- /dev/null
+++ b/programming_examples/basic/vvmul-merged-xclbin/test_vck5000.cpp
@@ -0,0 +1,138 @@
+//===- test_vck5000.cpp -----------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fcntl.h>
+#include <iostream>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <thread>
+#include <unistd.h>
+#include <vector>
+#include <xaiengine.h>
+
+#include "memory_allocator.h"
+#include "test_library.h"
+
+#include "aie_data_movement.cpp"
+#include "aie_inc.cpp"
+
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_amd.h"
+
+constexpr int DMA_COUNT = 256;
+
+void hsa_check_status(const std::string func_name, hsa_status_t status) {
+  if (status != HSA_STATUS_SUCCESS) {
+    // The HSA runtime owns the string it hands back: never allocate or free it.
+    const char *status_string = nullptr;
+    hsa_status_string(status, &status_string);
+    std::cout << func_name << " failed: " << (status_string ? status_string : "unknown") << std::endl;
+  } else {
+    std::cout << func_name << " success" << std::endl;
+  }
+}
+
+int main(int argc, char *argv[]) {
+  // Host testbench: configure the AIE array over HSA, multiply two
+  // DMA_COUNT-element vectors on the device and check every product.
+
+  std::vector<hsa_queue_t *> queues;
+  uint32_t aie_max_queue_size(0);
+
+  aie_libxaie_ctx_t *xaie = mlir_aie_init_libxaie();
+
+  // This is going to initialize HSA, create a queue
+  // and get an agent
+  int ret = mlir_aie_init_device(xaie);
+
+  if (ret) {
+    std::cout << "[ERROR] Error when calling mlir_aie_init_device)"
+              << std::endl;
+    return -1;
+  }
+
+  // Getting access to all of the HSA agents
+  std::vector<hsa_agent_t> agents = xaie->agents;
+
+  if (agents.empty()) {
+    std::cout << "No agents found. Exiting." << std::endl;
+    return -1;
+  }
+
+  std::cout << "Found " << agents.size() << " agents" << std::endl;
+
+  hsa_queue_t *q = xaie->cmd_queue;
+
+  // Adding to our vector of queues
+  queues.push_back(q);
+  assert(queues.size() > 0 && "No queues were sucesfully created!");
+
+  mlir_aie_configure_cores(xaie);
+  mlir_aie_configure_switchboxes(xaie);
+  mlir_aie_initialize_locks(xaie);
+  mlir_aie_configure_dmas(xaie);
+  mlir_aie_start_cores(xaie);
+
+  // Allocating some device memory
+  ext_mem_model_t buf0, buf1, buf2;
+  uint32_t *in_a = (uint32_t *)mlir_aie_mem_alloc(xaie, buf0, DMA_COUNT);
+  uint32_t *in_b = (uint32_t *)mlir_aie_mem_alloc(xaie, buf1, DMA_COUNT);
+  uint32_t *out = (uint32_t *)mlir_aie_mem_alloc(xaie, buf2, DMA_COUNT);
+  if (in_a == nullptr || in_b == nullptr || out == nullptr) {
+    std::cout << "Could not allocate in device memory" << std::endl;
+    return -1;
+  }
+
+  // Only touch the buffers once every allocation is known to have succeeded
+  mlir_aie_sync_mem_dev(buf0);
+  mlir_aie_sync_mem_dev(buf1);
+  mlir_aie_sync_mem_dev(buf2);
+  for (int i = 0; i < DMA_COUNT; i++) {
+    in_a[i] = i + 1;
+    in_b[i] = i;
+    out[i] = 0xdeface;
+  }
+
+  // Pass arguments in the order of dma_memcpys in the mlir
+  invoke_data_movement(queues[0], &agents[0], out, in_a, in_b);
+
+  int errors = 0;
+
+  for (int i = 0; i < DMA_COUNT; i++) {
+    uint32_t s0 = in_a[i];
+    uint32_t s1 = in_b[i];
+    uint32_t d = out[i];
+    printf("s0[%d] = 0x%x\n", i, s0);
+    printf("s1[%d] = 0x%x\n", i, s1);
+    printf("d[%d] = 0x%x\n", i, d);
+    if (d != (s0 * s1)) {
+      errors++;
+      printf("mismatch 0x%x != 0x%x * 0x%x\n", d, s0, s1);
+    }
+  }
+
+  // destroying the queue
+  hsa_queue_destroy(queues[0]);
+
+  // Shut down libxaie and HSA
+  mlir_aie_deinit_libxaie(xaie);
+
+  if (!errors) {
+    printf("PASS!\n");
+    return 0;
+  } else {
+    printf("fail %d/%d.\n", errors, DMA_COUNT);
+    return -1;
+  }
+}
diff --git a/programming_examples/basic/vvmul/CMakeLists.txt b/programming_examples/basic/vvmul/CMakeLists.txt
new file mode 100644
index 0000000000..68cfdbd97f
--- /dev/null
+++ b/programming_examples/basic/vvmul/CMakeLists.txt
@@ -0,0 +1,77 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_CXX_STANDARD 23) 
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(CMAKE_C_COMPILER gcc-13)
+    set(CMAKE_CXX_COMPILER g++-13)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+	set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC 
+    DATASIZE=${DATASIZE}
+    DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC 
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
diff --git a/programming_examples/basic/vvmul/Makefile b/programming_examples/basic/vvmul/Makefile
new file mode 100755
index 0000000000..b9d833cafb
--- /dev/null
+++ b/programming_examples/basic/vvmul/Makefile
@@ -0,0 +1,79 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# 
+##===----------------------------------------------------------------------===##
+
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+include ${srcdir}/../../makefile-common
+
+VPATH := ${srcdir}/../../../aie_kernels/aie2
+
+data_size = 4096
+targetname = vectorMult
+devicename = npu
+col = 0
+
+all: build/final.xclbin
+
+build2/%.o: %.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
+
+build/aie.mlir: ${srcdir}/aie2.py
+	mkdir -p ${@D}
+	python3 $< ${devicename} ${col} ${data_size} > $@
+
+build/final.xclbin: build/aie.mlir
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
+
+${targetname}.exe: ${srcdir}/test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake ${srcdir} -DTARGET_NAME=${targetname}  -DDATASIZE=${data_size}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@ 
+endif
+
+# Anchor the generator on srcdir so `make -f <dir>/Makefile` works from any cwd.
+build2/aie.mlir: ${srcdir}/../vector_scalar_mul/aie2.py
+	mkdir -p ${@D}
+	python3 $< ${data_size} 0 > $@
+
+build2/final.xclbin: build2/aie.mlir build2/scale.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
+
+
+# Changing variables when we target VCK5000
+vck5000: devicename=xcvc1902
+vck5000: col=6
+
+vck5000: build/aie.mlir
+	aiecc.py --xchesscc --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie.mlir \
+		-I/opt/xaiengine/include \
+		-I${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/include \
+		-L/opt/xaiengine/lib \
+		-L/lib/x86_64-linux-gnu/ \
+		${srcdir}/test_vck5000.cpp \
+		${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \
+		-Wl,-R/opt/xaiengine/lib \
+		-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
+
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt build2/final.xclbin
+	${powershell} ./$< --xclbin build/final.xclbin --xclbin_2 build2/final.xclbin --instr build/insts.txt --instr_2 build2/insts.txt -k MLIR_AIE
+
+clean:
+	rm -rf build build2 _build inst aie.mlir.prj core_* test.elf ${targetname}.exe build2/aie.mlir build2/final.xclbin
diff --git a/programming_examples/basic/vvmul/README.md b/programming_examples/basic/vvmul/README.md
new file mode 100644
index 0000000000..56b0a6cde0
--- /dev/null
+++ b/programming_examples/basic/vvmul/README.md
@@ -0,0 +1,57 @@
+<!---//===- README.md --------------------------*- Markdown -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+// 
+//===----------------------------------------------------------------------===//-->
+
+# <ins>Vector Vector Multiply</ins>
+
+A simple binary operator, which uses a single AIE core to multiply two vectors together.  The overall vector size in this design is `256` and it is processed by the core in smaller sub tiles of size `16`.  It shows how simple it can be to just feed data into the AIEs using the ObjectFIFO abstraction, and drain the results back to external memory.  This reference design can be run on either a Ryzen™ AI NPU or a VCK5000.
+
+The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targeting NPU or VCK5000. The AIE tile performs the multiplication operations and the Shim tile brings the data back out to external memory.
+
+## Source Files Overview
+
+1. `aie2.py`: defines the AIE array structural design using IRON AIE language bindings. This generates mlir-aie that is then compiled using `aiecc.py` to produce design binaries (i.e. XCLBIN and insts.txt for the NPU in Ryzen™ AI). 
+
+1. `test.cpp`: This C++ code is a testbench for the design example targeting Ryzen™ AI (AIE-ML). The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the program verifies the results.
+
+1. `test_vck5000.cpp`: This C++ code is a testbench for the design example targeting the VCK5000 PCIe card (AIE). The code is responsible for configuring the AIEs, allocating memory, providing input data, and executing the AIE design on the VCK5000. After executing, the program verifies the results.
+
+## Ryzen™ AI Usage
+
+### C++ Testbench
+
+To compile the design and C++ testbench:
+
+```
+make
+make vectorMult.exe
+```
+
+To run the design:
+
+```
+make run
+```
+
+## VCK5000 Usage
+
+### C++ Testbench
+
+To compile the design and C++ testbench:
+
+```
+make vck5000
+```
+
+To run the design:
+
+```
+./test.elf
+```
+
diff --git a/programming_examples/basic/vvmul/aie2.py b/programming_examples/basic/vvmul/aie2.py
new file mode 100644
index 0000000000..7cd85589ef
--- /dev/null
+++ b/programming_examples/basic/vvmul/aie2.py
@@ -0,0 +1,89 @@
+# vector_vector_mul/aie2.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+from aie.extras.dialects.ext import memref, arith
+
+import sys
+
+
+def my_vector_mul(N):
+    """Describe a single-core AIE design that multiplies two length-N i32 vectors element-wise."""
+    n = 16  # elements per object-FIFO buffer, i.e. one sub-vector "tile"
+    N_div_n = N // n  # tiles per vector; floor division - assumes N is a multiple of n, TODO confirm
+    buffer_depth = 2
+
+    if sys.argv[1] == "npu":
+        dev = AIEDevice.npu1_1col
+    elif sys.argv[1] == "xcvc1902":
+        dev = AIEDevice.xcvc1902
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+
+    @device(dev)
+    def device_body():
+        memRef_ty = T.memref(n, T.i32())
+
+        # No external AIE core functions are declared; the multiply is written inline below.
+
+        # Tile declarations: shim (row 0) and compute (row 2) tiles, both in column sys.argv[2]
+        ShimTile = tile(int(sys.argv[2]), 0)
+        ComputeTile2 = tile(int(sys.argv[2]), 2)
+
+        # AIE-array data movement with object fifos
+        of_in1 = object_fifo("in1", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
+        of_in2 = object_fifo("in2", ShimTile, ComputeTile2, buffer_depth, memRef_ty)
+        of_out = object_fifo("out", ComputeTile2, ShimTile, buffer_depth, memRef_ty)
+
+        # Compute tile 2: per sub-vector tile, acquire one buffer from each FIFO,
+        # multiply element by element into the output buffer, then release all
+        # three buffers.
+        @core(ComputeTile2)
+        def core_body():
+            # Effective while(1)
+            for _ in for_(sys.maxsize):
+                # Number of sub-vector "tile" iterations
+                for _ in for_(N_div_n):
+                    elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1)
+                    elem_in2 = of_in2.acquire(ObjectFifoPort.Consume, 1)
+                    elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    for i in for_(n):
+                        v0 = memref.load(elem_in1, [i])
+                        v1 = memref.load(elem_in2, [i])
+                        v2 = arith.muli(v0, v1)
+                        memref.store(v2, elem_out, [i])
+                        yield_([])
+                    of_in1.release(ObjectFifoPort.Consume, 1)
+                    of_in2.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
+                    yield_([])
+                yield_([])
+
+        # To/from AIE-array data movement
+        tensor_ty = T.memref(N, T.i32())
+        # The metadata strings below match the object_fifo names declared above.
+        @runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
+        def sequence(A, B, C):
+            npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
+            npu_sync(column=0, row=0, direction=0, channel=0)  # NOTE(review): column is fixed at 0, not sys.argv[2] - confirm intended
+
+
+with mlir_mod_ctx() as ctx:  # script entry point: argv = <device> <column> <vector length N>
+    my_vector_mul(int(sys.argv[3]))
+    res = ctx.module.operation.verify()
+    if res == True:  # print the generated module only when verification reports success
+        print(ctx.module)
+    else:
+        print(res)  # otherwise print whatever verify() returned
diff --git a/programming_examples/basic/vvmul/run_makefile.lit b/programming_examples/basic/vvmul/run_makefile.lit
new file mode 100644
index 0000000000..6875524001
--- /dev/null
+++ b/programming_examples/basic/vvmul/run_makefile.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile 
+// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
+// CHECK: PASS!
diff --git a/programming_examples/basic/vvmul/run_vck5000.lit b/programming_examples/basic/vvmul/run_vck5000.lit
new file mode 100644
index 0000000000..dcaa9f99c5
--- /dev/null
+++ b/programming_examples/basic/vvmul/run_vck5000.lit
@@ -0,0 +1,8 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: hsa, chess
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile vck5000
+// RUN: %run_on_vck5000 ./test.elf
diff --git a/programming_examples/basic/vvmul/sweep.sh b/programming_examples/basic/vvmul/sweep.sh
new file mode 100644
index 0000000000..579a545205
--- /dev/null
+++ b/programming_examples/basic/vvmul/sweep.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+# Sweep: clean, rebuild and run the design once per data size from M_lo to M_hi.
+M_lo=4096
+M_step=4096
+M_hi=12288
+
+for M in $(seq "$M_lo" "$M_step" "$M_hi"); do
+    make clean && make run data_size="$M"
+done
\ No newline at end of file
diff --git a/programming_examples/basic/vvmul/test.cpp b/programming_examples/basic/vvmul/test.cpp
new file mode 100644
index 0000000000..25e8c53295
--- /dev/null
+++ b/programming_examples/basic/vvmul/test.cpp
@@ -0,0 +1,199 @@
+
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+namespace po = boost::program_options;
+using TY_2 = std::uint32_t;
+const int scaleFactor = 3;
+
+int main(int argc, const char *argv[]) {
+
+  // ------------------------------------------------------
+  // Parse program arguments
+  // ------------------------------------------------------
+  po::options_description desc("Allowed options");
+  po::variables_map vm;
+  desc.add_options()("help,h", "produce help message")(
+    "xclbin", po::value<std::string>()->required(),
+    "the input xclbin path")(
+    "xclbin_2", po::value<std::string>()->required(),
+    "the input xclbin path2")(
+    "kernel_2,k", po::value<std::string>()->required(),
+    "the kernel_2 name in the XCLBIN (for instance PP_PRE_FD)")(
+    "verbosity,v", po::value<int>()->default_value(0),
+    "the verbosity of the output")(
+    "instr", po::value<std::string>()->required(),
+    "path of file containing userspace instructions sent to the NPU")(
+    "instr_2", po::value<std::string>()->required(),
+    "path of file containing userspace instructions sent to the NPU")(
+    "verify", po::value<bool>()->default_value(true),
+    "whether to verify the AIE computed output")(
+    "iters", po::value<int>()->default_value(1))(
+    "warmup", po::value<int>()->default_value(0))(
+    "trace_sz,t", po::value<int>()->default_value(0))(
+    "trace_file", po::value<std::string>()->default_value("trace.txt"),
+    "where to store trace output");
+
+  test_utils::parse_options(argc, argv, desc, vm);
+  const int verbosity = vm["verbosity"].as<int>();
+  const bool do_verify = vm["verify"].as<bool>();
+  const int trace_size = vm["trace_sz"].as<int>();
+  // NOTE: --kernel_2, --iters, --warmup and --trace_file are accepted but not
+  // read below: the kernel name and the 1000-run timing loop are hard-coded.
+
+  // DATASIZE is not defined in this file (presumably a -D build flag). *_SIZE_1
+  // and *_VOLUME_2 are element counts; the *_SIZE_2 values are byte counts.
+  constexpr int IN_SIZE_1 = DATASIZE;
+  constexpr int OUT_SIZE_1 = IN_SIZE_1;
+  constexpr int IN_VOLUME_2 = DATASIZE;
+  constexpr int OUT_VOLUME_2 = IN_VOLUME_2;
+  const int IN_SIZE_2 = IN_VOLUME_2 * sizeof(TY_2);
+  const int OUT_SIZE_2 = OUT_VOLUME_2 * sizeof(TY_2) + trace_size;
+
+  // ------------------------------------------------------
+  // Get device, load both xclbins and get a kernel handle for each
+  // ------------------------------------------------------
+  unsigned int device_index = 0;
+  auto kernelName = "MLIR_AIE";
+  auto device = xrt::device(device_index);
+
+  // Design 1: instruction stream, xclbin, hardware context, kernel handle.
+  std::vector<uint32_t> instr_v1 = test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+  auto xclbin_1 = xrt::xclbin(vm["xclbin"].as<std::string>());
+  device.register_xclbin(xclbin_1);
+  xrt::hw_context context_1(device, xclbin_1.get_uuid());
+  auto kernel_1 = xrt::kernel(context_1, kernelName);
+
+  // Design 2: the same steps with the second xclbin / instruction file.
+  std::vector<uint32_t> instr_v2 = test_utils::load_instr_sequence(vm["instr_2"].as<std::string>());
+  auto xclbin_2 = xrt::xclbin(vm["xclbin_2"].as<std::string>());
+  device.register_xclbin(xclbin_2);
+  xrt::hw_context context_2(device, xclbin_2.get_uuid());
+  auto kernel_2 = xrt::kernel(context_2, kernelName);
+
+  // ------------------------------------------------------
+  // Allocate buffer objects and fill the inputs
+  // ------------------------------------------------------
+  auto bo_instr_1 = xrt::bo(device, instr_v1.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel_1.group_id(1));
+  void *bufInstr_1 = bo_instr_1.map<void *>();
+  memcpy(bufInstr_1, instr_v1.data(), instr_v1.size() * sizeof(int));
+
+  // Design 1 inputs: A[i] = i + 1 and B[i] = i, written straight into the mapped buffers.
+  auto bo_inA_1 = xrt::bo(device, IN_SIZE_1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_1.group_id(3));
+  uint32_t *bufInA_1 = bo_inA_1.map<uint32_t *>();
+  for (int i = 0; i < IN_SIZE_1; i++)
+    bufInA_1[i] = i + 1;
+
+  auto bo_inB_1 = xrt::bo(device, IN_SIZE_1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_1.group_id(4));
+  uint32_t *bufInB_1 = bo_inB_1.map<uint32_t *>();
+  for (int i = 0; i < IN_SIZE_1; i++)
+    bufInB_1[i] = i;
+
+  auto bo_out_1 = xrt::bo(device, OUT_SIZE_1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_1.group_id(5));
+
+  auto bo_instr_2 = xrt::bo(device, instr_v2.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel_2.group_id(1));
+  void *bufInstr_2 = bo_instr_2.map<void *>();
+  memcpy(bufInstr_2, instr_v2.data(), instr_v2.size() * sizeof(int));
+
+  // Design 2 inputs: a vector A[i] = i + 1 and a single scale factor.
+  auto bo_inA_2 = xrt::bo(device, IN_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel_2.group_id(3));
+  TY_2 *bufInA_2 = bo_inA_2.map<TY_2 *>();
+  for (int i = 0; i < IN_VOLUME_2; i++)
+    bufInA_2[i] = i + 1;
+
+  auto bo_inFactor = xrt::bo(device, 1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel_2.group_id(4));
+  int32_t *bufInFactor = bo_inFactor.map<int32_t *>();
+  *bufInFactor = scaleFactor;
+
+  auto bo_outC_2 = xrt::bo(device, OUT_SIZE_2, XRT_BO_FLAGS_HOST_ONLY, kernel_2.group_id(5));
+
+  // Sync every host-written buffer towards the device before launching.
+  bo_instr_1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA_1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inB_1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_instr_2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA_2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inFactor.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // ------------------------------------------------------
+  // Run both kernels back to back 1000 times, timing each round trip
+  // ------------------------------------------------------
+  unsigned int opcode = 3;
+
+  // One sample per iteration (microseconds) is written to time_<DATASIZE>.txt.
+  const std::string file_name = "time_" + std::to_string(DATASIZE) + ".txt";
+  std::ofstream f_time(file_name);
+  if (!f_time)
+    std::cerr << "Warning: could not open " << file_name << " for writing" << std::endl;
+  for (int i = 1; i <= 1000; i++) {
+    const auto start = std::chrono::high_resolution_clock::now();
+
+    auto run1 = kernel_1(opcode, bo_instr_1, instr_v1.size(), bo_inA_1, bo_inB_1, bo_out_1);
+    run1.wait();
+    auto run2 = kernel_2(opcode, bo_instr_2, instr_v2.size(), bo_inA_2, bo_inFactor, bo_outC_2);
+    run2.wait();
+
+    const auto stop = std::chrono::high_resolution_clock::now();
+    // Keep the tick count integral so long runs are not rounded by a float.
+    const auto npu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
+    if (i < 11) // echo only the first ten samples to the console
+      std::cout << i << " " << IN_SIZE_1 << " NPU time: " << npu_time << "us." << std::endl;
+    f_time << npu_time << "\n";
+  }
+  f_time.close();
+
+  bo_out_1.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  uint32_t *bufOut_1 = bo_out_1.map<uint32_t *>();
+  bo_outC_2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  TY_2 *bufOut_2 = bo_outC_2.map<TY_2 *>();
+
+  // Compare against host-computed references (skipped when --verify=false).
+  int errors = 0;
+  if (do_verify) {
+    // Design 1: out[i] must equal A[i] * B[i] in unsigned 32-bit arithmetic.
+    for (int i = 0; i < OUT_SIZE_1; i++) {
+      const uint32_t expected = bufInA_1[i] * bufInB_1[i];
+      if (bufOut_1[i] != expected) {
+        std::cout << "Error in output " << bufOut_1[i]
+                  << " != " << bufInA_1[i] << " * " << bufInB_1[i]
+                  << std::endl;
+        errors++;
+      } else if (verbosity > 1) {
+        std::cout << "Correct output " << bufOut_1[i]
+                  << " == " << expected << std::endl;
+      }
+    }
+
+    // Design 2: out[i] must equal A[i] * scaleFactor.
+    for (int i = 0; i < IN_VOLUME_2; i++) {
+      const int32_t ref = bufInA_2[i] * scaleFactor;
+      const int32_t test = bufOut_2[i];
+      if (test != ref) {
+        if (verbosity >= 1)
+          std::cout << "Error in output " << test << " != " << ref << std::endl;
+        errors++;
+      } else if (verbosity >= 1) {
+        std::cout << "Correct output " << test << " == " << ref << std::endl;
+      }
+    }
+  }
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nfailed.\n\n";
+    return 1;
+  }
+}
diff --git a/programming_examples/basic/vvmul/test_vck5000.cpp b/programming_examples/basic/vvmul/test_vck5000.cpp
new file mode 100644
index 0000000000..ab2096bef9
--- /dev/null
+++ b/programming_examples/basic/vvmul/test_vck5000.cpp
@@ -0,0 +1,138 @@
+//===- test_vck5000.cpp -----------------------------------000---*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fcntl.h>
+#include <iostream>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <thread>
+#include <unistd.h>
+#include <vector>
+#include <xaiengine.h>
+
+#include "memory_allocator.h"
+#include "test_library.h"
+
+#include "aie_data_movement.cpp"
+#include "aie_inc.cpp"
+
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_amd.h"
+
+constexpr int DMA_COUNT = 256;
+
+// Prints "<func_name> success" or "<func_name> failed: <HSA status text>" to stdout.
+void hsa_check_status(const std::string func_name, hsa_status_t status) {
+  if (status != HSA_STATUS_SUCCESS) {
+    const char *status_string = "unknown HSA status"; // runtime-owned text: never new/delete
+    hsa_status_string(status, &status_string);
+    std::cout << func_name << " failed: " << status_string << std::endl;
+  } else {
+    std::cout << func_name << " success" << std::endl;
+  }
+}
+
+int main(int argc, char *argv[]) {
+  // Host testbench: multiplies two DMA_COUNT-element vectors on the AIE array
+  // and checks every output word against s0 * s1 computed on the host.
+  aie_libxaie_ctx_t *xaie = mlir_aie_init_libxaie();
+
+  // This is going to initialize HSA, create a queue
+  // and get an agent
+  int ret = mlir_aie_init_device(xaie);
+
+  if (ret) {
+    std::cout << "[ERROR] Error when calling mlir_aie_init_device)"
+              << std::endl;
+    return -1;
+  }
+
+  // Getting access to all of the HSA agents
+  std::vector<hsa_agent_t> agents = xaie->agents;
+
+  if (agents.empty()) {
+    std::cout << "No agents found. Exiting." << std::endl;
+    return -1;
+  }
+
+  std::cout << "Found " << agents.size() << " agents" << std::endl;
+
+  // Only the command queue held by the libxaie context is used.
+  std::vector<hsa_queue_t *> queues;
+  queues.push_back(xaie->cmd_queue);
+  assert(queues.size() > 0 && "No queues were sucesfully created!");
+
+  // Configure the array and start the cores before any data is moved.
+  mlir_aie_configure_cores(xaie);
+  mlir_aie_configure_switchboxes(xaie);
+  mlir_aie_initialize_locks(xaie);
+  mlir_aie_configure_dmas(xaie);
+  mlir_aie_start_cores(xaie);
+
+  // Allocating some device memory
+  ext_mem_model_t buf0, buf1, buf2;
+  uint32_t *in_a = (uint32_t *)mlir_aie_mem_alloc(xaie, buf0, DMA_COUNT);
+  uint32_t *in_b = (uint32_t *)mlir_aie_mem_alloc(xaie, buf1, DMA_COUNT);
+  uint32_t *out = (uint32_t *)mlir_aie_mem_alloc(xaie, buf2, DMA_COUNT);
+
+  // Bail out before syncing or writing a buffer that failed to allocate.
+  if (in_a == nullptr || in_b == nullptr || out == nullptr) {
+    std::cout << "Could not allocate in device memory" << std::endl;
+    return -1;
+  }
+
+  mlir_aie_sync_mem_dev(buf0);
+  mlir_aie_sync_mem_dev(buf1);
+  mlir_aie_sync_mem_dev(buf2);
+
+  // Inputs are a[i] = i + 1 and b[i] = i; the output is pre-filled with a
+  // sentinel so words the design never wrote show up as mismatches.
+  for (int i = 0; i < DMA_COUNT; i++) {
+    in_a[i] = i + 1;
+    in_b[i] = i;
+    out[i] = 0xdeface;
+  }
+
+  // Pass arguments in the order of dma_memcpys in the mlir
+  invoke_data_movement(queues[0], &agents[0], out, in_a, in_b);
+
+  int errors = 0;
+
+  for (int i = 0; i < DMA_COUNT; i++) {
+    uint32_t s0 = in_a[i];
+    uint32_t s1 = in_b[i];
+    uint32_t d = out[i];
+    printf("s0[%d] = 0x%x\n", i, s0);
+    printf("s1[%d] = 0x%x\n", i, s1);
+    printf("d[%d] = 0x%x\n", i, d);
+    if (d != (s0 * s1)) {
+      errors++;
+      printf("mismatch 0x%x != 0x%x * 0x%x\n", d, s0, s1);
+    }
+  }
+
+  // destroying the queue
+  hsa_queue_destroy(queues[0]);
+
+  // Shutdown AIR and HSA
+  mlir_aie_deinit_libxaie(xaie);
+
+  if (!errors) {
+    printf("PASS!\n");
+    return 0;
+  } else {
+    printf("fail %d/%d.\n", errors, DMA_COUNT);
+    return -1;
+  }
+}
diff --git a/test/npu-xrt/add_one_two/CMakeLists.txt b/test/npu-xrt/add_one_two/CMakeLists.txt
new file mode 100644
index 0000000000..8aedc5111f
--- /dev/null
+++ b/test/npu-xrt/add_one_two/CMakeLists.txt
@@ -0,0 +1,71 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Xilinx Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+
+# target_link_directories() below requires CMake >= 3.13
+cmake_minimum_required(VERSION 3.13)
+
+# powershell.exe on the PATH is used as the "running under WSL" signal.
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+    set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+    set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+    set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
+    test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC
+    ../../utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
+    ${XRT_INC_DIR}
+    ${Boost_INCLUDE_DIRS}
+)
+
+target_link_directories(${currentTarget} PUBLIC
+    ${XRT_LIB_DIR}
+    ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+        boost_program_options
+        boost_filesystem
+    )
+else()
+    target_link_libraries(${currentTarget} PUBLIC
+        xrt_coreutil
+    )
+endif()
\ No newline at end of file
diff --git a/test/npu-xrt/add_one_two/Makefile b/test/npu-xrt/add_one_two/Makefile
new file mode 100644
index 0000000000..1829431591
--- /dev/null
+++ b/test/npu-xrt/add_one_two/Makefile
@@ -0,0 +1,31 @@
+include ../makefile-common
+
+targetname = addOneObjfifo
+.PHONY: all run clean
+all: build/two.xclbin # build/insts.txt is written as a by-product of the aiecc.py runs below
+
+build/one.xclbin: aie1.mlir
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --xclbin-kernel-name=ADDONE --xclbin-kernel-id=0x901 --xclbin-instance-name=ADDONEINST --no-aiesim  --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
+
+build/two.xclbin: aie2.mlir build/one.xclbin
+	cd ${@D} && aiecc.py --xclbin-kernel-name=ADDTWO --xclbin-kernel-id=0x902 --xclbin-instance-name=ADDTWOINST --no-aiesim  --aie-generate-cdo --no-compile-host --xclbin-input=one.xclbin  --xclbin-name=${@F} \
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+	rm -rf _build
+	mkdir -p _build
+	cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+	cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+	cp _build/${targetname}.exe $@
+else
+	cp _build/${targetname} $@
+endif
+
+run: ${targetname}.exe build/two.xclbin build/insts.txt
+	${powershell} ./$< -x build/two.xclbin -i build/insts.txt
+
+clean:
+	rm -rf build _build ${targetname}.exe
\ No newline at end of file