From ae6253fb46e607b733ce5e8eb50b54f83459eaa6 Mon Sep 17 00:00:00 2001
From: xiaying <xiaotang.jxt@alibaba-inc.com>
Date: Sat, 24 Aug 2024 15:46:21 +0800
Subject: [PATCH] MNN:Sync: Sync Internal 2.9.4

---
 3rd_party/OpenCLHeaders/CL/cl2.hpp            |    4 +-
 CMakeLists.txt                                |   21 +-
 docs/index.rst                                |    1 +
 docs/inference/module.md                      |   37 +
 docs/tools/script.md                          |   70 +
 docs/transformers/diffusion.md                |   24 +-
 docs/transformers/llm.md                      |   71 +-
 express/Executor.cpp                          |   18 +-
 express/module/Module.cpp                     |   12 +-
 express/module/PipelineModule.cpp             |  195 ++-
 express/module/PipelineModule.hpp             |    2 +
 express/module/StaticModule.cpp               |  330 +++--
 express/module/StaticModule.hpp               |    6 +
 include/MNN/ErrorCode.hpp                     |   10 +
 include/MNN/Interpreter.hpp                   |   20 +
 include/MNN/MNNDefine.h                       |    2 +-
 include/MNN/expr/Executor.hpp                 |    7 +
 include/MNN/expr/Module.hpp                   |   10 +-
 project/ios/MNN.xcodeproj/project.pbxproj     |   44 +-
 project/ios/Playground/AppDelegate.mm         |   52 +-
 .../mobilenet_finetune/mobilenet_transfer.py  |    1 -
 .../imagenet_dataset.py                       |   97 --
 .../quant_aware_training.py                   |  125 --
 pymnn/pip_package/build_deps.py               |    2 +-
 pymnn/pip_package/pyproject.toml              |    5 +-
 pymnn/test/model_test.py                      |   13 +-
 pymnn/test/unit_test.py                       |    1 +
 pymnn/update_mnn_wrapper_assets.sh            |    2 +-
 source/backend/arm82/Arm82Functions.cpp       |    5 +
 .../arm64/low_memory/MNNCountMinMax_ARM82.S   |    2 +-
 .../low_memory/MNNPackedMatMulFP16_int4.S     |   12 +-
 .../low_memory/MNNPackedMatMulFP16_int8.S     |   12 +-
 .../MNNPackedMatMulRemainFP16_int4.S          |   36 +-
 .../MNNPackedMatMulRemainFP16_int8.S          |   36 +-
 .../backend/coreml/backend/CoreMLExecutor.mm  |   40 +-
 .../coreml/execution/CoreMLConvolution.cpp    |    4 +-
 source/backend/cpu/CPUAttention.cpp           |  491 +------
 source/backend/cpu/CPUAttention.hpp           |   32 +-
 source/backend/cpu/CPUBackend.cpp             |   10 +-
 source/backend/cpu/CPUBackend.hpp             |    2 +-
 source/backend/cpu/CPUConvolution.cpp         |  105 +-
 source/backend/cpu/CPUConvolution.hpp         |   19 +-
 .../backend/cpu/CPUConvolutionDepthwise.cpp   |    2 +-
 source/backend/cpu/CPUDeconvolution.cpp       |   12 +-
 source/backend/cpu/CPUDeconvolution.hpp       |   15 +-
 .../backend/cpu/CPUDeconvolutionDepthwise.cpp |    2 +-
 source/backend/cpu/CPUDepthwiseConvInt8.cpp   |    2 +-
 source/backend/cpu/KVCacheManager.cpp         |  467 ++++++
 source/backend/cpu/KVCacheManager.hpp         |  129 ++
 .../MNNGemmInt8AddBiasScale_16x4_w4_Unit.S    |   21 +-
 source/backend/cpu/arm/arm64/MNNAvgPoolInt8.S |   16 +-
 .../backend/cpu/arm/arm64/MNNBilinearLineC8.S |    3 +
 .../cpu/arm/arm64/MNNBilinearSampleC8.S       |    6 +-
 .../MNNGemmInt8AddBiasScale_ARMV86_Unit.S     |    1 +
 .../backend/cpu/compute/CommonOptFunction.cpp |   84 +-
 .../backend/cpu/compute/CommonOptFunction.h   |   11 +
 .../cpu/compute/ConvInt8TiledExecutor.cpp     |  376 +++--
 .../cpu/compute/ConvInt8TiledExecutor.hpp     |   12 +-
 .../backend/cpu/compute/ConvInt8Winograd.cpp  |  135 +-
 .../cpu/compute/ConvolutionFloatFactory.cpp   |   16 +-
 .../cpu/compute/DeconvolutionWithStride.cpp   |    4 +-
 .../compute/DenseConvolutionTiledExecutor.cpp |   97 +-
 .../backend/cpu/compute/GemmInt8Executor.cpp  |   25 +-
 .../backend/cpu/compute/GemmInt8Executor.hpp  |    2 +-
 .../cpu/compute/IdstConvolutionInt8.cpp       |    4 +-
 .../compute/SparseConvInt8TiledExecutor.cpp   |   23 +-
 .../compute/SparseConvInt8TiledExecutor.hpp   |    4 +-
 .../cuda/execution/ConvCutlassExecution.cu    |   10 +-
 .../cuda/execution/ConvDepthWiseExecution.cu  |   42 +-
 .../cuda/execution/ConvImplicitExecution.cu   |   18 +-
 .../cuda/execution/ConvWinogradExecution.cu   |   58 +-
 .../execution/DeconvSingleInputExecution.cu   |   20 +-
 .../bf16/ConvCutlassBf16Execution.cu          |    2 +-
 .../weight_only_quant/ConvFpAIntBExecution.cu |   18 +-
 .../backend/hiai/execution/NPUConvolution.cpp |    2 +-
 .../execution/NPUConvolutionDepthwise.cpp     |    4 +-
 source/backend/metal/MetalConvolution.mm      |   14 +-
 source/backend/metal/MetalConvolution1x1.mm   |   22 +-
 .../backend/metal/MetalConvolutionCommon.hpp  |    2 +-
 .../backend/metal/MetalConvolutionCommon.mm   |   86 +-
 .../metal/MetalConvolutionDepthwise.mm        |    8 +-
 .../backend/metal/MetalConvolutionWinograd.mm |    4 +-
 source/backend/metal/MetalDeconvolution.mm    |    6 +-
 .../nnapi/execution/NNAPIConvolution.cpp      |    2 +-
 source/backend/opencl/CMakeLists.txt          |    4 +-
 source/backend/opencl/core/OpenCLBackend.cpp  |   24 +-
 source/backend/opencl/core/OpenCLGemmTune.cpp |   21 +-
 .../opencl/core/OpenCLRunningUtils.cpp        |   16 +
 .../opencl/core/OpenCLRunningUtils.hpp        |    4 +
 .../opencl/core/runtime/OpenCLRuntime.cpp     |   26 +
 .../opencl/core/runtime/OpenCLRuntime.hpp     |    7 +
 .../opencl/core/runtime/OpenCLWrapper.cpp     |   37 -
 .../opencl/core/runtime/OpenCLWrapper.hpp     |    9 +-
 .../execution/buffer/CastBufExecution.cpp     |    1 +
 .../execution/buffer/ConvBufExecution.cpp     |  232 ++-
 .../execution/buffer/ConvBufExecution.hpp     |    6 +
 .../buffer/ConvBufLowMemoryExecution.cpp      |  124 +-
 .../buffer/ConvBufLowMemoryExecution.hpp      |    5 +-
 .../execution/buffer/ConvBufWinograd.cpp      |    2 +-
 .../buffer/ConvSubgroupBufExecution.cpp       |   24 +-
 .../execution/buffer/DeconvBufExecution.cpp   |    2 +-
 .../buffer/DepthwiseConvBufExecution.cpp      |    2 +-
 .../DepthwiseConvSubgroupBufExecution.cpp     |   10 +-
 .../buffer/GroupNormBufExecution.cpp          |    3 +
 .../execution/buffer/MatmulBufExecution.cpp   |   46 +-
 .../buffer/StrassenMatmulOpenCLComputor.cpp   |  470 ++++++
 .../buffer/StrassenMatmulOpenCLComputor.hpp   |   67 +
 .../execution/cl/buffer_convert_quant.cl      |  229 +--
 .../execution/cl/gemm_quant_batch_buf.cl      |  293 ++--
 .../opencl/execution/cl/gemv_conv1x1_buf.cl   |  451 +-----
 .../opencl/execution/cl/groupnorm_buf.cl      |   14 +-
 .../opencl/execution/cl/matmul_params_buf.cl  |  224 +--
 .../opencl/execution/cl/opencl_program.cc     | 1269 +++++++----------
 .../opencl/execution/cl/opencl_source_map.hpp |    6 +
 .../execution/cl/strassen_binary_buf.cl       |  101 ++
 .../opencl/execution/image/ConvExecution.cpp  |    2 +-
 .../image/ConvLowMemoryExecution.cpp          |  128 +-
 .../image/ConvLowMemoryExecution.hpp          |    2 +
 .../opencl/execution/image/ConvWinograd.cpp   |    2 +-
 .../execution/image/DeconvExecution.cpp       |    2 +-
 .../image/DepthwiseConvExecution.cpp          |    2 +-
 .../image/DepthwiseDeconvExecution.cpp        |    2 +-
 source/backend/opencl/schema/CLCache.fbs      |    6 +
 .../opencl/schema/current/CLCache_generated.h |  144 +-
 .../tensorrt/execution/TRTConvolution.cpp     |    2 +-
 .../tensorrt/execution/TRTDeconvolution.cpp   |    4 +-
 .../execution/TRTDepthwiseConvolution.cpp     |    4 +-
 .../execution/TRTDepthwiseDeconvolution.cpp   |    6 +-
 source/backend/vulkan/CMakeLists.txt          |    2 +-
 .../vulkan/buffer/backend/VulkanBackend.cpp   |    9 +-
 .../buffer/execution/VulkanConvolution.cpp    |    6 +-
 .../buffer/execution/VulkanDeconvolution.cpp  |    9 +-
 .../buffer/execution/VulkanDeconvolution.hpp  |    2 +-
 .../vulkan/buffer/execution/VulkanUnary.cpp   |    2 +
 .../backend/vulkan/component/VulkanDevice.cpp |   29 +-
 .../backend/vulkan/component/VulkanDevice.hpp |    3 +-
 .../vulkan/component/VulkanInstance.cpp       |   37 +-
 .../vulkan/component/VulkanInstance.hpp       |    5 -
 .../vulkan/component/VulkanPipeline.cpp       |    3 +-
 .../vulkan/component/VulkanQueryPool.cpp      |    2 +-
 .../vulkan/image/backend/VulkanBackend.cpp    |    5 +-
 .../vulkan/image/compiler/AllShader.cpp       | 1131 +++++++--------
 .../vulkan/image/compiler/VulkanShaderMap.cpp |    2 +
 .../vulkan/image/compiler/makeshader.py       |    3 +-
 .../vulkan/image/execution/VulkanArgMax.cpp   |  129 ++
 .../vulkan/image/execution/VulkanArgMax.hpp   |   40 +
 .../vulkan/image/execution/VulkanBinary.cpp   |    3 +
 .../image/execution/VulkanConvolution.cpp     |    2 +-
 .../image/execution/VulkanDeconvolution.cpp   |    9 +-
 .../image/execution/VulkanDeconvolution.hpp   |    2 +-
 .../VulkanDeconvolutionDepthwise.cpp          |    7 +-
 .../VulkanDeconvolutionDepthwise.hpp          |    2 +-
 .../vulkan/image/execution/VulkanRaster.cpp   |    3 +
 .../vulkan/image/execution/VulkanUnary.cpp    |    2 +
 .../vulkan/image/execution/glsl/argmax.comp   |   51 +
 .../vulkan/image/execution/glsl/avgpool.comp  |    2 +-
 .../image/execution/glsl/binaryImage.comp     |    2 +-
 .../image/execution/glsl/blit_image.comp      |    2 +-
 .../execution/glsl/convolutionDepthwise.comp  |   12 +-
 .../glsl/convolutionDepthwiseMali.comp        |   12 +-
 .../image/execution/glsl/deconvCol2Im.comp    |    4 +-
 .../image/execution/glsl/deconvIm2Col.comp    |    6 +-
 .../glsl/deconvolutionDepthwise.comp          |    8 +-
 .../image/execution/glsl/fill_image.comp      |    2 +-
 .../image/execution/glsl/gemm16x16.comp       |    8 +-
 .../execution/glsl/gridSampleBilinear.comp    |    6 +-
 .../execution/glsl/gridSampleNearest.comp     |    6 +-
 .../vulkan/image/execution/glsl/im2col.comp   |    4 +-
 .../image/execution/glsl/im2col1x1.comp       |    4 +-
 .../image/execution/glsl/imageTonc4hw4.comp   |    2 +-
 .../image/execution/glsl/imageTonchw.comp     |    2 +-
 .../vulkan/image/execution/glsl/macro.json    |    3 +
 .../image/execution/glsl/matmul_input.comp    |    2 +-
 .../image/execution/glsl/matmul_output.comp   |    2 +-
 .../vulkan/image/execution/glsl/maxpool.comp  |    2 +-
 .../image/execution/glsl/nc4hw4toimage.comp   |    2 +-
 .../image/execution/glsl/nchwToimage.comp     |    2 +-
 .../image/execution/glsl/packAsImage4x4.comp  |    2 +-
 .../execution/glsl/preluWithChannel.comp      |    6 +-
 .../vulkan/image/execution/glsl/relu.comp     |    4 +-
 .../vulkan/image/execution/glsl/relu6.comp    |    4 +-
 .../image/execution/glsl/resizeBilinear.comp  |    4 +-
 .../image/execution/glsl/resizeNearest.comp   |    4 +-
 .../image/execution/glsl/roipooling.comp      |    2 +-
 .../vulkan/image/execution/glsl/scale.comp    |    2 +-
 .../image/execution/glsl/unPackImage4x4.comp  |    2 +-
 .../image/execution/glsl/unaryImage.comp      |    2 +-
 .../glsl/winogradTransformDest2_3_1.comp      |    6 +-
 .../glsl/winogradTransformSource2_3_1.comp    |    6 +-
 .../backend/vulkan/image/shaders/AllShader.h  |    4 +
 .../backend/vulkan/runtime/VulkanRuntime.cpp  |   54 +-
 .../backend/vulkan/runtime/VulkanRuntime.hpp  |    3 +-
 source/backend/vulkan/vulkan/vulkan_core.h    |    9 +-
 source/core/Backend.cpp                       |   16 -
 source/core/Backend.hpp                       |   11 +-
 source/core/BufferAllocator.cpp               |    4 +-
 source/core/ConvolutionCommon.cpp             |   91 +-
 source/core/ConvolutionCommon.hpp             |    6 +-
 source/core/FileLoader.hpp                    |   27 +-
 source/core/IDSTDecoder.hpp                   |   84 +-
 source/core/Interpreter.cpp                   |   11 +-
 source/core/MNNFileUtils.cpp                  |  284 ++++
 source/core/MNNFileUtils.h                    |  182 +++
 source/core/OpCommonUtils.cpp                 |   24 +-
 source/core/Pipeline.cpp                      |   30 +-
 source/core/Pipeline.hpp                      |    1 +
 source/core/Schedule.cpp                      |   10 +-
 source/core/Schedule.hpp                      |    3 +
 source/core/Session.cpp                       |   29 +-
 source/core/Session.hpp                       |    1 +
 source/geometry/GeometryComputer.hpp          |    1 +
 source/geometry/GeometryComputerUtils.cpp     |   10 +
 .../geometry/GeometryConv2DBackPropFilter.cpp |    2 +-
 source/geometry/GeometryReverseSequence.cpp   |    4 +-
 source/shape/SizeComputer.cpp                 |    2 +-
 source/utils/InitNet.cpp                      |   90 +-
 source/utils/InitNet.hpp                      |    2 +-
 test.sh                                       |   33 +-
 test/MNNTestSuite.cpp                         |    1 +
 test/core/FileUtilsTest.cpp                   |  320 +++++
 test/core/IDSTTest.cpp                        |   12 +-
 test/expr/MemoryIncrease.cpp                  |   75 +
 test/expr/ModuleShapeInfer.cpp                |  108 ++
 test/expr/ReverseSequenceTest.cpp             |   30 +-
 test/grad/BinaryGradTest.cpp                  |    3 +
 test/grad/GridSampleGradTest.cpp              |    3 +-
 test/grad/PReLUGradTest.cpp                   |    3 +-
 test/op/ConvInt8Test.cpp                      |    1 +
 test/op/ResizeTest.cpp                        |    6 +-
 test/op/ReverseTest.cpp                       |   22 +
 .../optimizer/merge/ConvertMatMulToConv2D.cpp |   20 +-
 .../source/optimizer/onnxextra/OnnxClip.cpp   |   13 +-
 .../onnxextra/OnnxDeQuantizeLinear.cpp        |    6 +-
 .../source/optimizer/onnxextra/OnnxEinsum.cpp |    8 +-
 .../onnxextra/OnnxQuantizeLinear.cpp          |    4 +-
 .../tflitextra/ConvTranposeTflite.cpp         |   52 +
 .../source/tflite/ConvolutionTflite.cpp       |   16 +-
 .../converter/source/tflite/CustomTflite.cpp  |   74 +
 tools/cpp/ExprDebug.hpp                       |    6 +-
 tools/cpp/LoRA.cpp                            |    3 +-
 tools/quantization/calibration.cpp            |   30 +-
 tools/script/apply_gptq.py                    |   37 +-
 tools/script/apply_lora.py                    |  156 ++
 tools/script/arm_assembly.py                  |   30 +-
 tools/script/convertOnnxTest.py               |    1 +
 tools/script/convertTfTest.py                 |    1 +
 tools/script/convertTfliteTest.py             |    1 +
 tools/script/convertTorchTest.py              |    1 +
 tools/script/modelTest.py                     |    1 +
 tools/script/testPTQ.py                       |   12 +-
 tools/train/register.py                       |   44 +
 tools/train/source/demo/MobilenetV2Utils.cpp  |   32 +-
 tools/train/source/demo/MobilenetV2Utils.hpp  |    4 +-
 tools/train/source/demo/demoMain.cpp          |    2 +-
 tools/train/source/demo/mobilenetV2Train.cpp  |   83 +-
 tools/train/source/grad/BinaryGrad.cpp        |   10 +-
 tools/train/source/grad/BroadcastToGrad.cpp   |   10 +-
 tools/train/source/grad/ConcatGrad.cpp        |   13 +-
 tools/train/source/grad/ConvGrad.cpp          |   16 +-
 tools/train/source/grad/GatherGrad.cpp        |   12 +-
 tools/train/source/grad/GradOPRegister.cpp    |   65 +
 tools/train/source/grad/GridSampleGrad.cpp    |   12 +-
 tools/train/source/grad/InterpGrad.cpp        |   12 +-
 tools/train/source/grad/LoopGrad.cpp          |   12 +-
 tools/train/source/grad/MatMulGrad.cpp        |   12 +-
 .../train/source/grad/MatrixBandPartGrad.cpp  |   12 +-
 tools/train/source/grad/OpGrad.cpp            |    9 +
 tools/train/source/grad/OpGrad.hpp            |    6 +
 tools/train/source/grad/PermuteGrad.cpp       |   12 +-
 tools/train/source/grad/PoolGrad.cpp          |   12 +-
 tools/train/source/grad/RasterGrad.cpp        |   12 +-
 tools/train/source/grad/ReduceGrad.cpp        |   12 +-
 tools/train/source/grad/ReluGrad.cpp          |   12 +-
 tools/train/source/grad/RenderGrad.cpp        |   12 +-
 tools/train/source/grad/ReshapeGrad.cpp       |   12 +-
 tools/train/source/grad/RoiAlignGrad.cpp      |   12 +-
 tools/train/source/grad/RoiPoolGrad.cpp       |   12 +-
 tools/train/source/grad/ScaleGrad.cpp         |   12 +-
 tools/train/source/grad/SelectGrad.cpp        |   12 +-
 tools/train/source/grad/SeluGrad.cpp          |   12 +-
 tools/train/source/grad/SliceGrad.cpp         |   12 +-
 tools/train/source/grad/SoftmaxGrad.cpp       |   12 +-
 tools/train/source/grad/StridedSliceGrad.cpp  |   12 +-
 tools/train/source/grad/TensorConvertGrad.cpp |   12 +-
 tools/train/source/grad/TopKV2Grad.cpp        |   12 +-
 tools/train/source/grad/UnaryGrad.cpp         |   12 +-
 tools/train/source/grad/ZeroGrad.cpp          |   12 +-
 tools/train/source/models/MobilenetV2.cpp     |   45 +-
 tools/train/source/models/MobilenetV2.hpp     |    2 +-
 transformers/diffusion/main.cpp               |   32 +-
 transformers/diffusion/pipeline.cpp           |   80 +-
 transformers/diffusion/pipeline.hpp           |   17 +-
 transformers/llm/config.json                  |   10 +-
 transformers/llm/engine/CMakeLists.txt        |   30 +-
 transformers/llm/engine/include/llm/llm.hpp   |   11 +-
 transformers/llm/engine/llm_demo.cpp          |   20 +-
 transformers/llm/engine/src/llm.cpp           |   88 +-
 transformers/llm/engine/src/llmconfig.hpp     |   12 +-
 transformers/llm/engine/src/tokenizer.cpp     |   13 +-
 299 files changed, 7931 insertions(+), 4556 deletions(-)
 create mode 100644 docs/tools/script.md
 delete mode 100644 pymnn/examples/MNNTrain/quantization_aware_training/imagenet_dataset.py
 delete mode 100644 pymnn/examples/MNNTrain/quantization_aware_training/quant_aware_training.py
 create mode 100644 source/backend/cpu/KVCacheManager.cpp
 create mode 100644 source/backend/cpu/KVCacheManager.hpp
 create mode 100644 source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.cpp
 create mode 100644 source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.hpp
 create mode 100644 source/backend/opencl/execution/cl/strassen_binary_buf.cl
 create mode 100644 source/backend/vulkan/image/execution/VulkanArgMax.cpp
 create mode 100644 source/backend/vulkan/image/execution/VulkanArgMax.hpp
 create mode 100644 source/backend/vulkan/image/execution/glsl/argmax.comp
 create mode 100644 source/core/MNNFileUtils.cpp
 create mode 100644 source/core/MNNFileUtils.h
 create mode 100644 test/core/FileUtilsTest.cpp
 create mode 100644 test/expr/ModuleShapeInfer.cpp
 create mode 100644 tools/converter/source/optimizer/tflitextra/ConvTranposeTflite.cpp
 create mode 100644 tools/script/apply_lora.py
 create mode 100644 tools/train/register.py
 create mode 100644 tools/train/source/grad/GradOPRegister.cpp

diff --git a/3rd_party/OpenCLHeaders/CL/cl2.hpp b/3rd_party/OpenCLHeaders/CL/cl2.hpp
index 491285264..4db4f7cf6 100644
--- a/3rd_party/OpenCLHeaders/CL/cl2.hpp
+++ b/3rd_party/OpenCLHeaders/CL/cl2.hpp
@@ -805,9 +805,9 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #define __GET_GL_OBJECT_INFO_ERR            CL_HPP_ERR_STR_(clGetGLObjectInfo)
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
 #define __CREATE_IMAGE_ERR                  CL_HPP_ERR_STR_(clCreateImage)
-#define __CREATE_GL_TEXTURE_ERR             CL_HPP_ERR_STR_(clCreateFromGLTexture)
 #define __IMAGE_DIMENSION_ERR               CL_HPP_ERR_STR_(Incorrect image dimensions)
 #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __CREATE_GL_TEXTURE_ERR             CL_HPP_ERR_STR_(clCreateFromGLTexture)
 #define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR CL_HPP_ERR_STR_(clSetMemObjectDestructorCallback)
 
 #define __CREATE_USER_EVENT_ERR             CL_HPP_ERR_STR_(clCreateUserEvent)
@@ -5229,7 +5229,6 @@ class Image3DGL : public Image3D
 };
 #endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS
 
-#if CL_HPP_TARGET_OPENCL_VERSION >= 120
 /*! \class ImageGL
  * \brief general image interface for GL interop.
  * We abstract the 2D and 3D GL images into a single instance here
@@ -5308,7 +5307,6 @@ class ImageGL : public Image
         return *this;
     }
 };
-#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
 
 
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 006ae131f..f117f9d1d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -673,6 +673,15 @@ IF(MNN_TENSORRT)
   list(APPEND MNN_EXTRA_DEPENDS ${MNN_TRT_LIBS})
 ENDIF()
 
+IF(MNN_BUILD_LLM)
+    # add_definitions(-DMNN_BUILD_LLM)
+    include(${CMAKE_CURRENT_LIST_DIR}/transformers/llm/engine/CMakeLists.txt)
+    IF(NOT MNN_SEP_BUILD)
+      list(APPEND MNN_TARGETS llm)
+      list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:llm>)
+    ENDIF()
+ENDIF()
+
 IF(MNN_SEP_BUILD)
   add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS} ${MNN_EXTRA_HEADERS})
   target_link_libraries(MNN PUBLIC ${MNN_EXTRA_DEPENDS})
@@ -744,13 +753,7 @@ IF(MNN_BUILD_OPENCV AND NOT MNN_SEP_BUILD)
   ENDIF()
   target_sources(MNN PRIVATE $<TARGET_OBJECTS:MNNOpenCV>)
 ENDIF()
-IF(MNN_BUILD_LLM)
-    # add_definitions(-DMNN_BUILD_LLM)
-    include(${CMAKE_CURRENT_LIST_DIR}/transformers/llm/engine/CMakeLists.txt)
-    IF(NOT MNN_SEP_BUILD)
-      target_sources(MNN PRIVATE $<TARGET_OBJECTS:llm>)
-    ENDIF()
-ENDIF()
+
 
 if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
 # Using -pthread, needed by thread-safe implemention of glibc, is better than only using -lpthread
@@ -761,9 +764,7 @@ elseif(CMAKE_SYSTEM_NAME MATCHES "^Android")
 else()
 endif()
 if (NOT MNN_BUILD_SHARED_LIBS)
-    if(APPLE)
-        set(MNN_DEPS -Wl,-all_load ${MNN_DEPS} -Wl,-noall_load)
-    elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
         # Static-link will not replace thread-related weak symbol in glibc with strong symbol
         # in pthread library, so we need use --whole-archive to pthread
         # https://stackoverflow.com/questions/35116327/when-g-static-link-pthread-cause-segmentation-fault-why
diff --git a/docs/index.rst b/docs/index.rst
index 8c97f2410..827a85235 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -82,6 +82,7 @@
    tools/compress
    tools/visual
    tools/python
+   tools/script
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/inference/module.md b/docs/inference/module.md
index 22347a576..7ec90a8a4 100644
--- a/docs/inference/module.md
+++ b/docs/inference/module.md
@@ -200,6 +200,9 @@ MNN::Express::ExecutorScope scope(executor);
 module_thread.reset();
 ```
 
+## 多线程
+Module 的创建与运行依赖其所绑定的 Executor ，若不指定，则为全局 Executor ，并非线程安全。在多线程创建 Module 或进行推理时，会竞争全局 Executor 的资源，需要上锁或绑定不同的 Executor 。
+
 ## 调试
 
 Module API 也支持使用回调函数进行调试，与[runSessionWithCallBack](session.html#id19)相似。示例代码：
@@ -232,6 +235,40 @@ Express::Executor::getGlobalExecutor()->setCallBack(std::move(beforeCallBack), s
 std::vector<VARP> outputs  = user_module->onForward(inputs);
 ```
 
+## 预推理分离模式
+对于满足 Interpreter-Session 运行条件的模型，若用户希望分离预推理（形状计算，几何计算，资源申请，策略搜索）与推理（内容计算）过程，可以设置预推理分离模式。示例代码如下：
+
+```cpp
+std::shared_ptr<Module> net(Module::load({"x"}, {"y"}, (const uint8_t*)buffer.data(), buffer.size()), Module::destroy);
+// 预推理分离模式
+auto code = net->traceOrOptimize(Interpreter::Module_Forward_Seperate);
+if (0 != code) {
+    // 若模型不支持预推理分离，需要还原设定
+    net->traceOrOptimize(Interpreter::Module_Forward_Combine);
+}
+
+/*预推理开始*/
+x = _Input({1, 3, 2, 2}, NCHW, halide_type_of<int>());
+auto input = x->writeMap<int>();
+y = net->onForward({x})[0];
+auto output = y->readMap<int>();
+
+/*预推理结束，获取输入和输出的数据指针*/
+
+/*内容计算*/
+/*
+Fill input
+*/
+
+// 输入空数组，表示仅进行推理
+net1->onForward({});
+
+/*
+Use output
+*/
+
+```
+
 ## 示例代码
 完整的示例代码可以参考`demo/exec/`文件夹中的以下源码文件：
 - `pictureRecognition_module.cpp` 使用`Module`执行图像分类，使用`ImageProcess`进行前处理，`Expr`进行后处理
diff --git a/docs/tools/script.md b/docs/tools/script.md
new file mode 100644
index 000000000..551b8e7b1
--- /dev/null
+++ b/docs/tools/script.md
@@ -0,0 +1,70 @@
+# 脚本工具
+一些功能性脚本，提供各种功能。
+
+## apply_gptq.py
+将GPTQ的权重写入到量化的MNN权重中。
+
+### 用法
+```
+usage: apply_gptq.py [-h] --mnn_graph MNN_GRAPH --mnn_weight MNN_WEIGHT --gptq_tensor GPTQ_TENSOR
+
+apply_gptq
+
+options:
+  -h, --help            show this help message and exit
+  --mnn_graph MNN_GRAPH
+                        mnn graph json path.
+  --mnn_weight MNN_WEIGHT
+                        mnn weight file path.
+  --gptq_tensor GPTQ_TENSOR
+                        gptq tensor path.
+```
+
+### 参数
+- MNN_GRAPH: 模型计算图的json文件，获取方法：`./MNNDump2Json model.mnn model.json`
+- MNN_WEIGHT:  模型的权重文件，如：`gptq.mnn.weight`
+- GPTQ_TENSOR: GPTQ量化后的权重文件，`model.safetensor`
+
+### 示例
+使用该脚本生成gptq量化的权重`gptq.mnn.weight`
+```sh
+cd build
+./MNNDump2Json model.mnn model.json
+cp model.mnn.weight gptq.mnn.weight
+python ../tools/script/apply_gptq.py --mnn_graph model.json --mnn_weight gptq.mnn.weight --gptq_tensor model.safetensor
+```
+
+## apply_lora.py
+
+合并base模型的计算图和lora模型的权重文件，生成新的计算图。
+
+### 用法
+```sh
+usage: apply_lora.py [-h] --base BASE --lora LORA [--scale SCALE] [--fuse FUSE] [--out OUT]
+
+apply_lora
+
+options:
+  -h, --help     show this help message and exit
+  --base BASE    base model json path.
+  --lora LORA    lora dir path or *.safetensors path.
+  --scale SCALE  lora scale: `alpha/r`.
+  --fuse FUSE    fuse A and B.
+  --out OUT      out file name.
+```
+
+### 参数
+- BASE: base.json, base模型计算图的json文件，获取方法：`./MNNDump2Json base.mnn base.json`
+- LORA: lora权重文件夹或者lora权重的safetensors
+- SCALE: lora权重的scale, `lora_alpha / lora_r`, 一般为4.0
+- FUSE: 是否将lora_A与lora_B合并成为一个lora权重，合并后模型较大
+- OUT: 生成新的计算图文件名，默认为`lora.json`，转换为模型：`./MNNRevert2Buffer lora.json lora.mnn`
+
+### 示例
+使用该脚本生成lora对应的模型`lora.mnn`, 用法: [LoRA](../transformers/llm.html#lora)
+```sh
+cd build
+./MNNDump2Json base.mnn base.json
+python ../tools/script/apply_lora.py --base base.json --lora lora_dir
+./MNNRevert2Buffer lora.json lora.mnn
+```
\ No newline at end of file
diff --git a/docs/transformers/diffusion.md b/docs/transformers/diffusion.md
index 32d790a26..70e64766b 100644
--- a/docs/transformers/diffusion.md
+++ b/docs/transformers/diffusion.md
@@ -35,9 +35,10 @@ conda activate ldm
 ```
 ./MNNConvert -f ONNX --modelFile onnx_save_path/text_encoder/model.onnx --MNNModel mnn_save_path/text_encoder.mnn --weightQuantBits 8 --bizCode biz
 ```
-2. 实现denoiser从onnx模型 -> mnn模型
+2. 实现denoiser unet从onnx模型 -> mnn模型
 ```
 ./MNNConvert -f ONNX --modelFile onnx_save_path/unet/model.onnx --MNNModel mnn_save_path/unet.mnn --transformerFuse --weightQuantBits 8 --bizCode biz
+注意：对于非OpenCL后端推理，需要去掉--transformerFuse。
 ```
 3. 实现decoder从onnx模型 -> mnn模型
 ```
@@ -60,19 +61,26 @@ cd mnn_path/project/android/build
 ```
 ## 运行Diffusion Demo
 ```
-./diffusion_demo <resource_path> <model_type> <output_image_name> <input_text>
+./diffusion_demo <resource_path> <model_type> <output_image_name> <memory_mode> <backend_type> <input_text>
 ```
 其中，resource_path 就是mnn模型文件的路径，除了mnn文件，还需要:
 1. 将MNN目录transformers/diffusion/scheduler/alphas.txt文件拷贝到该文件夹下。
-2. 针对stable-diffusion-v1-5模型需要将huggingfacetokenizer目录下merges.txt和vocab.json拷贝到该文件夹中。
+2. 针对stable-diffusion-v1-5/chilloutmix模型需要将huggingfacetokenizer目录下merges.txt和vocab.json拷贝到该文件夹中。
 3. 针对Taiyi-Stable-Diffusion模型需要将huggingfacetokenizer目录下vocab.txt拷贝到该文件夹中。
-4. model_type是目前支持的两种diffusion模型的类别。如果是stable-diffusion-v1-5模型设为0，如果是Taiyi-Stable-Diffusion模型设为1。
+4. model_type是目前支持的两种diffusion模型的类别。如果是stable-diffusion-v1-5/chilloutmix模型设为0，如果是Taiyi-Stable-Diffusion模型设为1。
 5. output_image_name是生成图片的名字，默认图片位置在当前运行目录下。
-6. input_text是文生图的prompt，如果是stable-diffusion-v1-5模型建议英文prompt，如果是Taiyi-Stable-Diffusion建议中文prompt。
+6. memory_mode代表设备是否内存足够，设为0表示内存节约模式(demo中每个模型使用前等待初始化，用完释放)，1代表内存足够模式(所有模式启动时全初始化完，用时无需等待初始化)。
+7. backend_type代表选择的运行后端。
+8. input_text是文生图的prompt，如果是stable-diffusion-v1-5/chilloutmix模型建议英文prompt，如果是Taiyi-Stable-Diffusion建议中文prompt。
 
 运行指令例如: 
 ```
-./diffusion_demo mnn_sd1.5_path 0 demo.jpg "a cute cat"
-./diffusion_demo mnn_chilloutmix_path 0 demo.jpg "a pure girl"
-./diffusion_demo mnn_taiyi_path 1 demo.jpg "一只可爱的猫"
+./diffusion_demo mnn_sd1.5_path 0 demo.jpg 0 3 "a cute cat"
+./diffusion_demo mnn_chilloutmix_path 0 demo.jpg 0 3 "a pure girl"
+./diffusion_demo mnn_taiyi_path 1 demo.jpg 0 3 "一只可爱的猫"
 ```
+## FAQ
+1. Demo运行报错、段错误，怎么解决？
+- 常见错误可能是设备内存不足，通常支持opencl fp16的设备需要保证3GB以上的内存，不支持fp16则需要6GB以上显存了。
+2. 使用其他后端，出现报错，什么原因？
+- 目前其他后端暂不支持transformer插件算子，需要在onnx->mnn模型转换阶段，去掉--transformerFuse。
diff --git a/docs/transformers/llm.md b/docs/transformers/llm.md
index 2358548c6..5e77ab0cb 100644
--- a/docs/transformers/llm.md
+++ b/docs/transformers/llm.md
@@ -110,7 +110,7 @@ options:
 
 ### 编译
 
-[从源码编译](../compile/tools.html#id4)
+[从源码编译](../compile/other.html#id4)
 
 ### 使用
 #### 运行时配置
@@ -151,7 +151,7 @@ options:
     - 3: 使用非对称8bit量化存储key，使用fp8格式寸处value
 - 硬件配置
   - backend_type: 推理使用硬件后端类型，默认为：`"cpu"`
-  - thread_num: 推理使用硬件线程数，默认为：`4`
+  - thread_num: CPU推理使用硬件线程数，默认为：`4`; OpenCL推理时使用`68`
   - precision: 推理使用精度策略，默认为：`"low"`，尽量使用`fp16`
   - memory: 推理使用内存策略，默认为：`"low"`，开启运行时量化
 
@@ -201,4 +201,69 @@ options:
 ./llm_demo model_dir/llm.mnn
 ## 针对prompt中的每行进行回复
 ./llm_demo model_dir/llm.mnn prompt.txt
-```
\ No newline at end of file
+```
+
+#### GPTQ权重加载
+- 使用脚本生成GPTQ模型权重，用法参考: [apply_gptq.py](../tools/script.html#apply-gptq-py)
+- 创建`gptq.json`配置文件
+  ```json
+  {
+      "llm_model": "model.mnn",
+      "llm_weight": "gptq.mnn.weight",
+  }
+  ```
+
+
+#### LoRA权重加载
+- 使用脚本生成lora模型，用法参考: [apply_lora.py](../tools/script.html#apply-lora-py)
+- lora模型使用
+  - 直接加载lora模型使用，创建`lora.json`配置文件
+  ```json
+  {
+      "llm_model": "lora.mnn",
+      "llm_weight": "base.mnn.weight",
+  }
+  ```
+  - 运行时选择并切换lora模型
+  ```cpp
+  // 创建并加载base模型
+  std::unique_ptr<Llm> llm(Llm::createLLM(config_path));
+  llm->load();
+  // 使用同一个对象，在多个lora模型之间选择性使用，不可以并发使用
+  {
+      // 在基础模型的基础上添加`lora_1`模型，模型的索引为`lora_1_idx`
+      size_t lora_1_idx = llm->apply_lora("lora_1.mnn");
+      llm->response("Hello lora1"); // 使用`lora_1`模型推理
+      // 添加`lora_2`模型，并使用
+      size_t lora_2_idx = llm->apply_lora("lora_2.mnn");
+      llm->response("Hello lora2"); // 使用`lora_2`模型推理
+      // 通过索引选择`lora_1`作为llm对象当前使用的模型
+      llm->select_module(lora_1_idx);
+      llm->response("Hello lora1"); // 使用`lora_1`模型推理
+      // 释放加载的lora模型
+      llm->release_module(lora_1_idx);
+      llm->release_module(lora_2_idx);
+      // 选择使用基础模型
+      llm->select_module(0);
+      llm->response("Hello base"); // 使用`base`模型推理
+  }
+  // 使用多个对象，可以并发的加载使用多个lora模型
+  {
+      std::mutex creat_mutex;
+      auto chat = [&](const std::string& lora_name) {
+          MNN::BackendConfig bnConfig;
+          auto newExe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1);
+          ExecutorScope scope(newExe);
+          Llm* current_llm = nullptr;
+          {
+              std::lock_guard<std::mutex> guard(creat_mutex);
+              current_llm = llm->create_lora(lora_name);
+          }
+          current_llm->response("Hello");
+      };
+      std::thread thread1(chat, "lora_1.mnn");
+      std::thread thread2(chat, "lora_2.mnn");
+      thread1.join();
+      thread2.join();
+  }
+  ```
\ No newline at end of file
diff --git a/express/Executor.cpp b/express/Executor.cpp
index f6b85765c..437d72df6 100644
--- a/express/Executor.cpp
+++ b/express/Executor.cpp
@@ -48,7 +48,7 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
             std::shared_ptr<Runtime> bn(creator->onCreate(info));
             mRuntimes[mAttr->firstType] = bn;
         } else {
-            firstIter->second->onReset(numberThread, &config);
+            firstIter->second->onReset(numberThread, &config, true);
         }
     } else {
         auto creator = MNNGetExtraRuntimeCreator(type);
@@ -69,7 +69,7 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
             std::shared_ptr<Runtime> bn(creator->onCreate(info));
             mRuntimes[mAttr->firstType] = bn;
         } else {
-            firstIter->second->onReset(numberThread, &config);
+            firstIter->second->onReset(numberThread, &config, true);
         }
     }
     _refreshRuntime();
@@ -147,10 +147,6 @@ static std::shared_ptr<Executor>* gExecutor = nullptr;
 std::shared_ptr<Executor> Executor::getGlobalExecutor() {
     std::call_once(gInitFlag, [&]() {
         auto creator = MNNGetExtraRuntimeCreator(MNN_FORWARD_CPU);
-#ifdef MNN_BUILD_MINI
-        SizeComputerSuite::init();
-        GeometryComputer::init();
-#endif
         Backend::Info info;
         info.type = MNN_FORWARD_CPU;
         info.numThread = 1;
@@ -158,7 +154,9 @@ std::shared_ptr<Executor> Executor::getGlobalExecutor() {
         RuntimeHint hint;
         hint.memoryAllocatorType = 0;// Defer
         bn->setRuntimeHint(hint);
-        gExecutor = new std::shared_ptr<Executor>(new Executor(bn, MNN_FORWARD_CPU, 1));
+        static std::shared_ptr<Executor> executorStatic;
+        executorStatic.reset(new Executor(bn, MNN_FORWARD_CPU, 1));
+        gExecutor = &executorStatic;
     });
     return *gExecutor;
 }
@@ -254,6 +252,10 @@ void Executor::RuntimeManager::setMode(Interpreter::SessionMode mode) {
 void Executor::RuntimeManager::setHint(Interpreter::HintMode mode, int value) {
     mInside->modes.setHint(mode, value);
 }
+void Executor::RuntimeManager::setExternalPath(std::string path, int type) {
+    mInside->modes.setExternalPath(path, type);
+}
+
 bool Executor::RuntimeManager::getInfo(Interpreter::SessionInfoCode code, void* ptr) {
     // Only support get memory
     switch (code) {
@@ -320,7 +322,7 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S
         }
         originRt.insert(std::make_pair(compute.type, std::shared_ptr<Runtime>(newBn)));
     } else {
-        iter->second->onReset(compute.numThread, compute.user);
+        iter->second->onReset(compute.numThread, compute.user, false);
     }
     res->mInside->mRuntime.second =  originRt[DEFAULT_BACKUP_RUNTIME_KEY];
     res->mInside->mRuntime.first.insert(std::make_pair(compute.type, originRt[compute.type]));
diff --git a/express/module/Module.cpp b/express/module/Module.cpp
index 82172bfbd..4ba49c27a 100644
--- a/express/module/Module.cpp
+++ b/express/module/Module.cpp
@@ -481,10 +481,18 @@ Module* Module::extract(std::vector<Express::VARP> inputs, std::vector<Express::
     return new PipelineModule(inputs, outputs);
 }
 int Module::traceOrOptimize(Interpreter::SessionMode stage) {
+    auto code = this->onOptimize(stage);
+    if (code != 0) {
+        // Has Error
+        return code;
+    }
     for (auto& m : mChildren) {
-        m->traceOrOptimize(stage);
+        code = m->traceOrOptimize(stage);
+        if (code != 0) {
+            return code;
+        }
     }
-    return this->onOptimize(stage);
+    return code;
 }
 
 
diff --git a/express/module/PipelineModule.cpp b/express/module/PipelineModule.cpp
index 932ae6daa..ee9f4d496 100644
--- a/express/module/PipelineModule.cpp
+++ b/express/module/PipelineModule.cpp
@@ -17,6 +17,7 @@
 #include "core/Backend.hpp"
 #include "core/WrapExecution.hpp"
 #include "core/FileLoader.hpp"
+#include "core/OpCommonUtils.hpp"
 #include "utils/InitNet.hpp"
 #include "RuntimeAttr.hpp"
 #include "geometry/GeometryComputer.hpp"
@@ -197,8 +198,27 @@ std::vector<int> PipelineModule::countOutputReference(std::vector<int> outputInd
     }
     return countResult;
 }
+int PipelineModule::onOptimize(Interpreter::SessionMode stage) {
+    if (stage == Interpreter::Module_Forward_Separate) {
+        if (mSubModules.size() == 1 && std::get<0>(mSubModules[0])->type() == "StaticModule") {
+            mSeperate = true;
+            return 0;
+        }
+        return NOT_SUPPORT;
+    } else if (stage == Interpreter::Module_Forward_Combine) {
+        mSeperate = false;
+    }
+    return 0;
+}
 
 std::vector<VARP> PipelineModule::onForward(const std::vector<VARP>& inputs) {
+    if (mSeperate && inputs.empty()) {
+        for (int index = 0; index < mSubModules.size(); ++index) {
+            auto& m = mSubModules[index];
+            std::get<0>(m)->onForward(inputs);
+        }
+        return {};
+    }
     std::vector<VARP> mStack(mStackSize);
     for (int i = 0; i < mInitVars.size(); ++i) {
         mStack[i + mInputSize] = mInitVars[i];
@@ -386,6 +406,93 @@ static std::vector<int> _collectNeededOps(const MNN::Net* net, const std::set<in
     return ops;
 }
 
+static std::vector<int> _findBreakIndex(const SubModuleInfo& info, const Net* net, std::shared_ptr<Schedule::ScheduleInfo> sharedConst) {
+    // 0: not used, 1: const, 2: output
+    std::vector<uint8_t> constMask(sharedConst->allTensors.size(), 0);
+    for (int i=0; i<sharedConst->allTensors.size(); ++i) {
+        if(sharedConst->allTensors[i].get() != nullptr) {
+            constMask[i] = 1;
+        }
+    }
+    for (int v = 0; v < info.opList.size(); ++v) {
+        auto op = net->oplists()->GetAs<Op>(info.opList[v]);
+        if (nullptr == op->outputIndexes()) {
+            continue;
+        }
+        bool isConst = true;
+        if (nullptr != op->inputIndexes()) {
+            for (int i=0; i<op->inputIndexes()->size(); ++i) {
+                auto index = op->inputIndexes()->data()[i];
+                if (constMask[index]) {
+                    continue;
+                }
+                if (OpCommonUtils::opNeedContent(op, i)) {
+                    isConst = false;
+                    break;
+                }
+            }
+        }
+        if (isConst) {
+            for (int i=0; i<op->outputIndexes()->size(); ++i) {
+                auto index = op->outputIndexes()->data()[i];
+                constMask[index] = 1;
+            }
+        }
+    }
+    std::vector<int> res;
+    // Check Break Index
+    for (int v = 0; v < info.opList.size(); ++v) {
+        auto op = net->oplists()->GetAs<Op>(info.opList[v]);
+        if (nullptr == op->outputIndexes() || nullptr == op->inputIndexes()) {
+            continue;
+        }
+        int inputNum = op->inputIndexes()->size();
+        auto dims = SizeComputer::needInputContent(op, inputNum);
+        for (auto index : dims) {
+            if (index < inputNum) {
+                if (constMask[op->inputIndexes()->data()[index]] != 1) {
+                    res.emplace_back(v);
+                    break;
+                }
+            }
+        }
+    }
+    return res;
+}
+static std::vector<SubModuleInfo> _splitSubModuleForShapeConst(const std::vector<SubModuleInfo>& origin, const Net* net, std::shared_ptr<Schedule::ScheduleInfo> sharedConst) {
+    std::vector<SubModuleInfo> res;
+    for (auto& m : origin) {
+        if (m.isBreak) {
+            res.emplace_back(std::move(m));
+            continue;
+        }
+        auto breakIndexes = _findBreakIndex(m, net, sharedConst);
+        if (breakIndexes.size() > 0) {
+            int current = 0;
+            for (auto breakIndex : breakIndexes) {
+                // Split
+                if (breakIndex > current) {
+                    SubModuleInfo m0;
+                    m0.opList.insert(m0.opList.begin(), m.opList.begin() + current, m.opList.begin() + breakIndex);
+                    res.emplace_back(std::move(m0));
+                }
+                SubModuleInfo m1;
+                m1.opList = {m.opList[breakIndex]};
+                res.emplace_back(std::move(m1));
+                current = breakIndex + 1;
+            }
+            if (current < m.opList.size()) {
+                SubModuleInfo m2;
+                m2.opList.insert(m2.opList.begin(), m.opList.begin() + current, m.opList.end());
+                res.emplace_back(std::move(m2));
+            }
+        } else {
+            res.emplace_back(std::move(m));
+        }
+    }
+    return res;
+}
+
 static std::vector<SubModuleInfo> _createSubModuleInfo(std::shared_ptr<BufferStorage> bufferStorage, const std::set<int>& inputIndexes, const std::set<int>& outputIndexes, const std::set<int>& noComputeIndexes, std::shared_ptr<Schedule::ScheduleInfo> sharedConst) {
     std::vector<SubModuleInfo> submodule;
     auto net = flatbuffers::GetRoot<Net>(bufferStorage->buffer());
@@ -400,8 +507,6 @@ static std::vector<SubModuleInfo> _createSubModuleInfo(std::shared_ptr<BufferSto
             // TODO: Don't need split segment
             if (current.opList.size() > 0) {
                 // Not empty
-                // Init tensormask
-                _computeTensorMask(current, net);
                 submodule.emplace_back(std::move(current));
             }
             SubModuleInfo controlOp;
@@ -421,13 +526,14 @@ static std::vector<SubModuleInfo> _createSubModuleInfo(std::shared_ptr<BufferSto
         current.opList.emplace_back(i);
     }
     if (!current.opList.empty()) {
-        _computeTensorMask(current, net);
         submodule.emplace_back(std::move(current));
     }
+    submodule = _splitSubModuleForShapeConst(submodule, net, sharedConst);
     for (int moduleIndex=0; moduleIndex < submodule.size(); ++moduleIndex) {
         auto& m = submodule[moduleIndex];
         // Compute input / output
         if (!m.isBreak) {
+            _computeTensorMask(m, net);
             for (int i=0; i<m.tensorMask.size(); ++i) {
                 if (0 == m.tensorMask[i]) {
                     continue;
@@ -533,7 +639,7 @@ static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, co
     scheduleInfo.defaultBackend = sharedConst->defaultBackend;
     scheduleInfo.constReplaceBackend = sharedConst->constReplaceBackend;
     scheduleInfo.allTensors = sharedConst->allTensors;
-    scheduleInfo.validForResize = initTensors(scheduleInfo.allTensors, net);
+    scheduleInfo.validForResize = initTensors(scheduleInfo.allTensors, net, info.opList.data(), info.opList.size());
     std::vector<Schedule::OpCacheInfo> oplists;
     std::vector<const Op*> ops;
     ops.reserve(info.opList.size());
@@ -633,10 +739,7 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
         modRuntime.userConfig = &rtMgr->getInside()->mConfig;
         modRuntime.compute.type      = modRuntime.rt.first.begin()->first;
         modRuntime.compute.numThread = 1;
-        // set allocator type
         modRuntime.rt.first.begin()->second->setRuntimeHint(rtMgr->getInside()->modes.runtimeHint);
-        // set winograd memory type
-        modRuntime.rt.second->setRuntimeHint(rtMgr->getInside()->modes.runtimeHint);
     }
     auto& rt = modRuntime.rt;
     auto firstRt = rt.first[modRuntime.compute.type];
@@ -687,16 +790,51 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
         MNN_ERROR("\n");
         return nullptr;
     }
-    for (auto index : noneedComputeIndexes) {
-        auto tensor = Tensor::clone(sharedConst->allTensors[index].get());
-        auto constVar = Variable::create(Expr::create(tensor, true));
-        initVars.insert(std::make_pair(index, constVar));
-    }
     auto subModulesInfo = _createSubModuleInfo(bufferStorage, inputIndexes, outputIndexes, noneedComputeIndexes, sharedConst);
     std::vector<std::shared_ptr<Module>> subModules(subModulesInfo.size());
     for (int i=0; i<subModulesInfo.size(); ++i) {
         subModules[i].reset(_createSubModule(bufferStorage, subModulesInfo[i], subGraphMap, sharedConst, *config, modRuntime));
     }
+    if (!permitCodeGen) {
+        // Prereplace const tensor
+        auto curBackend = sharedConst->constReplaceBackend.get();
+        if (sharedConst->constReplaceBackend->type() != sharedConst->defaultBackend->type()) {
+            for (auto& t : sharedConst->allTensors) {
+                if (nullptr == t.get()) {
+                    continue;
+                }
+                auto des = TensorUtils::getDescribe(t.get());
+                if (des->isMutable) {
+                    continue;
+                }
+                if (!WrapExecution::needWrap(t.get(), curBackend)) {
+                    continue;
+                }
+                if (des->stageMask & Tensor::InsideDescribe::GEOMETRY_STAGE) {
+                    continue;
+                }
+                if (des->stageMask & Tensor::InsideDescribe::CONVERTED_STAGE) {
+                    continue;
+                }
+                std::shared_ptr<Tensor> wrapTensor = WrapExecution::makeCopyTensor(t.get(), curBackend);
+                auto outDes = TensorUtils::getDescribe(wrapTensor.get());
+                outDes->usage = des->usage;
+                auto tempRes = WrapExecution::allocAndCopy(curBackend, t.get(), wrapTensor.get());
+                if (!tempRes) {
+                    continue;
+                }
+                outDes->stageMask |= Tensor::InsideDescribe::CONVERTED_STAGE;
+                WrapExecution::copyReplaceTensor(wrapTensor.get(), t.get());
+            }
+        }
+        // Clear CPU Const memory
+        rt.second->onGabageCollect(0);
+    }
+    for (auto index : noneedComputeIndexes) {
+        auto tensor = Tensor::clone(sharedConst->allTensors[index].get());
+        auto constVar = Variable::create(Expr::create(tensor, true));
+        initVars.insert(std::make_pair(index, constVar));
+    }
     auto result = new PipelineModule;
     result->mInputSize = inputs.size();
     /**
@@ -751,39 +889,6 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
     }
     result->registerModel(subModules);
     result->mSharedConst = sharedConst;
-    if (!permitCodeGen) {
-        // Prereplace const tensor
-        auto curBackend = sharedConst->constReplaceBackend.get();
-        if (sharedConst->constReplaceBackend->type() != sharedConst->defaultBackend->type()) {
-            for (auto& t : sharedConst->allTensors) {
-                if (nullptr == t.get()) {
-                    continue;
-                }
-                auto des = TensorUtils::getDescribe(t.get());
-                if (des->isMutable) {
-                    continue;
-                }
-                if (!WrapExecution::needWrap(t.get(), curBackend)) {
-                    continue;
-                }
-                if (des->stageMask & Tensor::InsideDescribe::GEOMETRY_STAGE) {
-                    continue;
-                }
-                if (des->stageMask & Tensor::InsideDescribe::CONVERTED_STAGE) {
-                    continue;
-                }
-                std::shared_ptr<Tensor> wrapTensor = WrapExecution::makeCopyTensor(t.get(), curBackend);
-                auto outDes = TensorUtils::getDescribe(wrapTensor.get());
-                outDes->usage = des->usage;
-                auto tempRes = WrapExecution::allocAndCopy(curBackend, t.get(), wrapTensor.get());
-                if (!tempRes) {
-                    continue;
-                }
-                outDes->stageMask |= Tensor::InsideDescribe::CONVERTED_STAGE;
-                WrapExecution::copyReplaceTensor(wrapTensor.get(), t.get());
-            }
-        }
-    }
     return result;
 }
 
diff --git a/express/module/PipelineModule.hpp b/express/module/PipelineModule.hpp
index 198f06063..9b28c0c96 100644
--- a/express/module/PipelineModule.hpp
+++ b/express/module/PipelineModule.hpp
@@ -49,6 +49,7 @@ class PipelineModule : public Module {
 
     MNN_PUBLIC PipelineModule(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs,
                    const Transformer& transformFunction = {});
+    int onOptimize(Interpreter::SessionMode stage) override;
 private:
     static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, std::shared_ptr<BufferStorage> bufferStorage, const std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Module::Config* config, std::map<std::string, SubGraph>& subGraphMap);
     static void _createSubGraph(const MNN::Net* net, std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Module::Config* config, std::map<std::string, SubGraph>& subGraphMap);
@@ -64,6 +65,7 @@ class PipelineModule : public Module {
     friend class NN;
     std::vector<VARP> mInitVars;
     std::shared_ptr<Schedule::ScheduleInfo> mSharedConst;
+    bool mSeperate = false;
 };
 } // namespace Express
 } // namespace MNN
diff --git a/express/module/StaticModule.cpp b/express/module/StaticModule.cpp
index f382d2ff6..ec5fd2982 100644
--- a/express/module/StaticModule.cpp
+++ b/express/module/StaticModule.cpp
@@ -22,14 +22,41 @@
 namespace MNN {
 namespace Express {
 
+static const StaticModule* getStaticModule(const Module* m) {
+    if (m->type() == "StaticModule") {
+        return static_cast<const StaticModule*>(m);
+    }
+    if (m->getChildren().empty()) {
+        return nullptr;
+    }
+    return getStaticModule(m->getChildren()[0].get());
+}
+
 static std::vector<std::shared_ptr<BufferStorage>> preRearrangeWeights( // NOLINT
-    Schedule::ScheduleInfo& scheduleInfo, Backend* backend, Backend* backupBackend) {
+                                                                       Schedule::ScheduleInfo& scheduleInfo, Backend* backend, Backend* backupBackend, const Module* base = nullptr) {
+    std::map<const std::string, std::shared_ptr<Execution>> base_executions;
+    if (base != nullptr) {
+        // has base module
+        auto static_module = getStaticModule(base);
+        if (static_module) {
+            auto session = static_module->getSession();
+            std::vector<Schedule::OpCacheInfo> op_caches = session->getPipelineInfo(0).second;
+            for (auto& op_cache : op_caches) {
+                const auto& exe_cache = op_cache.executionCache;
+                for (const auto& exe_item : exe_cache) {
+                    if (exe_item.first->name()) {
+                        base_executions.insert(std::make_pair(exe_item.first->name()->str(), exe_item.second));
+                    }
+                }
+            }
+        }
+    }
     FileLoader loader(scheduleInfo.externalWeightPath.c_str());
     auto&& pipelineInfo = scheduleInfo.pipelineInfo[0].second;
     std::vector<std::shared_ptr<BufferStorage>> splitOps(pipelineInfo.size());
     for (int i = 0; i < pipelineInfo.size(); ++i) {
         auto& info = pipelineInfo[i];
-        auto op       = pipelineInfo[i].op;
+        auto op    = pipelineInfo[i].op;
         std::unique_ptr<OpT> op_table(op->UnPack());
         std::shared_ptr<Execution> exe;
         switch (op->type()) {
@@ -37,52 +64,68 @@ static std::vector<std::shared_ptr<BufferStorage>> preRearrangeWeights( // NOLIN
             case MNN::OpType_ConvInt8:
             case MNN::OpType_ConvolutionDepthwise:
             case MNN::OpType_Convolution: {
-                DataType type = DataType_DT_FLOAT;
-                auto conv2d = op->main_as_Convolution2D();
-                // Create Default Inputs and Outputs
-                auto tempInput = info.inputs[0];
-                auto tempOutput = info.outputs[0];
-                auto common = conv2d->common();
-                if (scheduleInfo.pipelineInfo[0].first.needComputeGeometry) {
-                    // Set default shape to create execution
-                    int ow = 2, oh = 2;
-                    int iw = (common->kernelX() - 1) * common->dilateX() + common->strideX() * (ow - 1) + 1;
-                    int ih = (common->kernelY() - 1) * common->dilateY() + common->strideY() * (oh - 1) + 1;
-                    TensorUtils::getDescribe(tempInput)->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;;
-                    tempInput->setLength(0, 1);
-                    tempInput->setLength(1, conv2d->common()->inputCount());
-                    tempInput->setLength(2, ih);
-                    tempInput->setLength(3, iw);
-                    TensorUtils::getDescribe(tempOutput)->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;;
-                    tempOutput->setLength(0, 1);
-                    tempOutput->setLength(1, conv2d->common()->outputCount());
-                    tempOutput->setLength(2, oh);
-                    tempOutput->setLength(3, ow);
-                    if (op->main_as_Convolution2D()->quanParameter()) {
-                        type = DataType_DT_INT8;
-                        int inputIdx = op->inputIndexes()->Get(0);
-                        auto& inputQuantAttr = TensorUtils::getDescribe(tempInput)->quantAttr;
-                        if (nullptr != inputQuantAttr.get()) {
-                            TensorUtils::getDescribe(tempInput)->type = DataType_DT_INT8;
+                if (!base_executions.empty() && op->name()) {
+                    auto iter = base_executions.find(op->name()->str());
+                    if (iter != base_executions.end()) {
+                        auto base_exe = iter->second.get();
+                        Execution* copyExecution = nullptr;
+                        base_exe->onClone(backend, op, &copyExecution);
+                        if (copyExecution == nullptr) {
+                            base_exe->onClone(backupBackend, op, &copyExecution);
                         }
-                        auto& outputQuantAttr = TensorUtils::getDescribe(tempOutput)->quantAttr;
-                        if (nullptr != outputQuantAttr.get()) {
-                            TensorUtils::getDescribe(tempOutput)->type = DataType_DT_INT8;
+                        if (copyExecution != nullptr && copyExecution->onClone(nullptr, op, nullptr)) {
+                            exe.reset(copyExecution);
                         }
                     }
                 }
-                std::shared_ptr<BufferStorage> tmpstorage;
-                exe.reset(OpCommonUtils::createExecutionWithExternal(backend, info.inputs, info.outputs, op, &loader, tmpstorage));
-                if (exe.get() == nullptr) {
-                    exe.reset(OpCommonUtils::createExecutionWithExternal(backupBackend, info.inputs, info.outputs, op, &loader, tmpstorage));
-                }
-                if (nullptr == exe) {
-                    break;
-                }
-                // The exe can't clone
-                if (!exe->onClone(nullptr, op, nullptr)) {
-                    exe = nullptr;
-                    break;
+                if (exe == nullptr) {
+                    DataType type = DataType_DT_FLOAT;
+                    auto conv2d = op->main_as_Convolution2D();
+                    // Create Default Inputs and Outputs
+                    auto tempInput = info.inputs[0];
+                    auto tempOutput = info.outputs[0];
+                    auto common = conv2d->common();
+                    if (scheduleInfo.pipelineInfo[0].first.needComputeGeometry) {
+                        // Set default shape to create execution
+                        int ow = 2, oh = 2;
+                        int iw = (common->kernelX() - 1) * common->dilateX() + common->strideX() * (ow - 1) + 1;
+                        int ih = (common->kernelY() - 1) * common->dilateY() + common->strideY() * (oh - 1) + 1;
+                        TensorUtils::getDescribe(tempInput)->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;;
+                        tempInput->setLength(0, 1);
+                        tempInput->setLength(1, conv2d->common()->inputCount());
+                        tempInput->setLength(2, ih);
+                        tempInput->setLength(3, iw);
+                        TensorUtils::getDescribe(tempOutput)->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;;
+                        tempOutput->setLength(0, 1);
+                        tempOutput->setLength(1, conv2d->common()->outputCount());
+                        tempOutput->setLength(2, oh);
+                        tempOutput->setLength(3, ow);
+                        if (op->main_as_Convolution2D()->quanParameter()) {
+                            type = DataType_DT_INT8;
+                            int inputIdx = op->inputIndexes()->Get(0);
+                            auto& inputQuantAttr = TensorUtils::getDescribe(tempInput)->quantAttr;
+                            if (nullptr != inputQuantAttr.get()) {
+                                TensorUtils::getDescribe(tempInput)->type = DataType_DT_INT8;
+                            }
+                            auto& outputQuantAttr = TensorUtils::getDescribe(tempOutput)->quantAttr;
+                            if (nullptr != outputQuantAttr.get()) {
+                                TensorUtils::getDescribe(tempOutput)->type = DataType_DT_INT8;
+                            }
+                        }
+                    }
+                    std::shared_ptr<BufferStorage> tmpstorage;
+                    exe.reset(OpCommonUtils::createExecutionWithExternal(backend, info.inputs, info.outputs, op, &loader, tmpstorage));
+                    if (exe.get() == nullptr) {
+                        exe.reset(OpCommonUtils::createExecutionWithExternal(backupBackend, info.inputs, info.outputs, op, &loader, tmpstorage));
+                    }
+                    if (nullptr == exe) {
+                        break;
+                    }
+                    // The exe can't clone
+                    if (!exe->onClone(nullptr, op, nullptr)) {
+                        exe = nullptr;
+                        break;
+                    }
                 }
                 if (OpParameter_Convolution2D == op_table->main.type) {
                     op_table->main.AsConvolution2D()->bias.clear();
@@ -148,12 +191,12 @@ static bool _reshapeTensor(Tensor* tensor, const Tensor* dims) {
     }
     return dirty;
 }
-static void _resizeTensor(Tensor* tensor, const Tensor* dims, Session* session, Schedule::TENSORCACHE* cacheTensor) {
+static bool _resizeTensor(Tensor* tensor, const Tensor* dims, Session* session, Schedule::TENSORCACHE* cacheTensor) {
     MNN_ASSERT(nullptr != tensor);
     bool dirty = _reshapeTensor(tensor, dims);
 
     if (!dirty) {
-        return;
+        return false;
     }
 
     tensor->buffer().dimensions = (int)dims->dimensions();
@@ -172,7 +215,7 @@ static void _resizeTensor(Tensor* tensor, const Tensor* dims, Session* session,
             std::get<2>(*cacheTensor) = true;
         }
     }
-    session->setNeedResize();
+    return true;
 }
 void StaticModule::resetInputOutputs() {
     mPrevInputTensor.resize(mResource->mInputs.size());
@@ -196,6 +239,49 @@ void StaticModule::resetInputOutputs() {
             des->usage = Tensor::InsideDescribe::OUTPUT;
         }
     }
+    // Mask Geometry Compute Mid Tensor release able indexes
+    auto& infos = pipelineInfo;
+    for (auto& info : infos.second) {
+        info.releaseAbleInputs.clear();
+        if (info.type != Schedule::Type::CONSTANT) {
+            continue;
+        }
+        for (auto t : info.inputs) {
+            auto des = TensorUtils::getDescribe(t);
+            if (des->usage == Tensor::InsideDescribe::CONSTANT && des->isMutable) {
+                des->useCount = 0;
+            }
+        }
+    }
+    for (auto& info : infos.second) {
+        for (auto t : info.inputs) {
+            auto des = TensorUtils::getDescribe(t);
+            if (des->usage == Tensor::InsideDescribe::CONSTANT && des->isMutable) {
+                des->useCount++;
+            }
+        }
+    }
+    for (int i = 0; i < mResource->mOutputFromTensor.size(); ++i) {
+        mOutputTensors[i] = mSession->getTensor(mResource->mOutputs[mResource->mOutputFromTensor[i]]);
+        auto des = TensorUtils::getDescribe(mOutputTensors[i]);
+        if (des->usage == Tensor::InsideDescribe::CONSTANT && des->isMutable) {
+            des->useCount ++;
+        }
+    }
+    for (auto& info : infos.second) {
+        if (info.type != Schedule::Type::CONSTANT) {
+            continue;
+        }
+        for (int v=0; v<info.inputs.size(); ++v) {
+            auto des = TensorUtils::getDescribe(info.inputs[v]);
+            if (des->usage == Tensor::InsideDescribe::CONSTANT && des->isMutable) {
+                des->useCount--;
+                if (des->useCount == 0) {
+                    info.releaseAbleInputs.emplace_back(v);
+                }
+            }
+        }
+    }
 }
 
 StaticModule::StaticModule(std::vector<int> inputs,
@@ -233,7 +319,7 @@ StaticModule::StaticModule(std::vector<int> inputs,
         }
     }
     if (config.rearrange) {
-        mResource->mBuffer = preRearrangeWeights(scheduleInfo, bnCache.cache.first.get(), bnCache.cache.second.get());
+        mResource->mBuffer = preRearrangeWeights(scheduleInfo, bnCache.cache.first.get(), bnCache.cache.second.get(), config.base);
     } else {
         mResource->mBuffer = std::move(buffer);
     }
@@ -300,33 +386,12 @@ void StaticModule::onClearCache() {
         }
     }
 }
-
-std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
-
-    AUTOTIME;
-    std::vector<Express::VARP> outputs(mResource->mOutputNumbers);
-    for (auto& iter : mResource->mOutputFromInput) {
-        outputs[iter.first] = inputs[iter.second];
-    }
-    if (mResource->mOutputFromTensor.empty()) {
-        return outputs;
-    }
-    Variable::compute(inputs);
-#ifdef MNN_DUMP_MEMORY
-    auto rt = Executor::getRuntime();
-    auto mem = rt.second->onGetMemoryInMB();
-    for (auto iter : rt.first) {
-        if (iter.second.get() != rt.second.get()) {
-            mem += iter.second->onGetMemoryInMB();
-        }
-    }
-    FUNC_PRINT_ALL(mem, f);
-#endif
-
-    MNN_ASSERT(inputs.size() == mInputTensors.size());
+ErrorCode StaticModule::_resize(const std::vector<Express::VARP>& inputs) {
+    ErrorCode code = NO_ERROR;
     auto& pipelineInfo = mSession->getPipelineInfo(0);
     if (mResource->mModes.inputMode == Interpreter::Session_Input_User) {
         pipelineInfo.first.inputBackendChange = false;
+        bool needResize = mResource->mUseContentInputs;
         for (int i = 0; i < inputs.size(); ++i) {
             if (nullptr == mInputTensors[i]) {
                 continue;
@@ -366,6 +431,7 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
             if (srcDes->tensorArrayAttr.get() != nullptr) {
                 // For tensorArray, don't need content
                 needCopy = false;
+                mSession->setNeedResize();
             }
             bool needMalloc;
             if (needCopy) {
@@ -390,18 +456,38 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
             des->dimensionFormat = srcDes->dimensionFormat;
             des->tensorArrayAttr = srcDes->tensorArrayAttr;
             mInputTensors[i]->buffer().type = inputTensor->buffer().type;
-            _resizeTensor(mInputTensors[i], inputTensor, mSession.get(), cacheTensor);
+            if (_resizeTensor(mInputTensors[i], inputTensor, mSession.get(), cacheTensor)) {
+                needResize = true;
+            }
             if (needMalloc) {
                 mSession->setNeedMalloc();
             }
         }
-        if (mResource->mUseContentInputs) {
+        if (needResize) {
             mSession->setNeedResize();
         }
-        auto code = mSession->resize();
-        if (NO_ERROR != code) {
-            FUNC_PRINT(code);
-            return {};
+        code = mSession->resize();
+        if (!needResize) {
+            // Check if output is used by other vars. If used, must realloc output to avoid the content dirty for output vars
+            // If resized, the output's memory will be all released in Session::resize, don't need clear here
+            for (auto& output : mOutputTensors) {
+                auto desOrigin = TensorUtils::getDescribeOrigin(output);
+                if ((!desOrigin->mContent->isMutable) || nullptr == desOrigin->mem.get()) {
+                    continue;
+                }
+                auto bn = desOrigin->getBackend();
+                if (nullptr == bn) {
+                    continue;
+                }
+                if (desOrigin->mContent.use_count() > 1 && desOrigin->mContent->usage != Tensor::InsideDescribe::CONSTANT) {
+                    desOrigin->mem = nullptr;
+                    auto res = bn->onAcquireBuffer(output, Backend::STATIC);
+                    if (!res) {
+                        return OUT_OF_MEMORY;
+                    }
+                    mSession->setNeedMalloc();
+                }
+            }
         }
     } else {
         // Resize
@@ -414,9 +500,11 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
             auto des = TensorUtils::getDescribe(mInputTensors[i]);
             des->dimensionFormat = srcDes->dimensionFormat;
             mInputTensors[i]->buffer().type = inputTensor->buffer().type;
-            _resizeTensor(mInputTensors[i], inputTensor, mSession.get(), nullptr);
+            if (_resizeTensor(mInputTensors[i], inputTensor, mSession.get(), nullptr)) {
+                mSession->setNeedResize();
+            }
         }
-        mSession->resize();
+        code = mSession->resize();
         // Copy
         for (int i = 0; i < inputs.size(); ++i) {
             if (nullptr == mInputTensors[i]) {
@@ -427,19 +515,10 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
             mInputTensors[i]->copyFromHostTensor(inputTensor);
         }
     }
+    return code;
+}
 
-
-#ifdef LOG_VERBOSE
-    for (auto& inputTensor : mInputTensors) {
-        MNN_PRINT("static module, before run, input ptr:%p, hostPtr:%p,  shape:", inputTensor, inputTensor->host<void>());
-        inputTensor->printShape();
-        MNN_PRINT("\n");
-        auto shape = inputTensor->shape();
-    }
-    MNN_PRINT("staticmodule before run\n");
-#endif
-
-
+ErrorCode StaticModule::_execute() {
     ErrorCode code;
     if (mResource->mModes.callBackMode == Interpreter::Session_Debug) {
         auto globalExecutor = ExecutorScope::Current();
@@ -452,9 +531,58 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
     } else {
         code = mSession->run();
     }
+    return code;
+}
+
+std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
+
+    AUTOTIME;
+    std::vector<Express::VARP> outputs;
+    bool runResize = (!mShapeInferSeperate) || inputs.size() > 0;
+    bool runCompute = (!mShapeInferSeperate) || inputs.size() == 0;
+    if (runResize) {
+        outputs.resize(mResource->mOutputNumbers);
+        for (auto& iter : mResource->mOutputFromInput) {
+            outputs[iter.first] = inputs[iter.second];
+        }
+    }
+    if (mResource->mOutputFromTensor.empty()) {
+        return outputs;
+    }
+    Variable::compute(inputs);
+#ifdef MNN_DUMP_MEMORY
+    auto rt = Executor::getRuntime();
+    auto mem = rt.second->onGetMemoryInMB();
+    for (auto iter : rt.first) {
+        if (iter.second.get() != rt.second.get()) {
+            mem += iter.second->onGetMemoryInMB();
+        }
+    }
+    FUNC_PRINT_ALL(mem, f);
+#endif
+
+    ErrorCode code = NO_ERROR;
+    if (runResize) {
+        code = _resize(inputs);
+    }
+    if (NO_ERROR == code && runCompute) {
+        code = _execute();
+    }
     if (NO_ERROR != code) {
+        FUNC_PRINT(code);
         return {};
     }
+    if (!runResize) {
+        for (auto& var : mOutputVars) {
+            // Check if needed recopy
+            auto inside = var->expr().first->inside();
+            if (nullptr != inside->mHostTensor) {
+                inside->mOutputTensors[0]->copyToHostTensor(inside->mHostTensor);
+            }
+        }
+        return {};
+    }
+    auto& pipelineInfo = mSession->getPipelineInfo(0);
     for (int i = 0; i < mOutputTensors.size(); ++i) {
         auto tensor = Tensor::clone(mOutputTensors[i]);
         outputs[mResource->mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(tensor, true));
@@ -469,7 +597,9 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
             outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = mResource->mSharedConst->constReplaceBackend;
         }
     }
-
+    if (mShapeInferSeperate && runResize) {
+        mOutputVars = outputs;
+    }
 #ifdef MNN_INTERNAL_ENABLED
     auto glo = ExecutorScope::Current();
     float flops = 0.0f;
@@ -492,6 +622,7 @@ Module* StaticModule::clone(CloneContext* ctx) const {
     return this->cloneBaseTo(ctx, module);
 }
 int StaticModule::onOptimize(Interpreter::SessionMode stage) {
+    int res = 0;
     switch (stage) {
         case MNN::Interpreter::Session_Resize_Check:
             mSession->openResizeCheck();
@@ -499,10 +630,21 @@ int StaticModule::onOptimize(Interpreter::SessionMode stage) {
         case MNN::Interpreter::Session_Resize_Fix:
             mSession->fixResizeCache();
             break;
+        case MNN::Interpreter::Module_Forward_Separate:
+            if (mResource->mUseContentInputs || mResource->mModes.inputMode != Interpreter::Session_Input_User || mResource->mOutputFromTensor.empty()) {
+                res = NOT_SUPPORT;
+                break;
+            }
+            mShapeInferSeperate = true;
+            break;
+        case MNN::Interpreter::Module_Forward_Combine:
+            mOutputVars.clear();
+            mShapeInferSeperate = false;
+            break;
         default:
             break;
     }
-    return 0;
+    return res;
 }
 
 } // namespace Express
diff --git a/express/module/StaticModule.hpp b/express/module/StaticModule.hpp
index 3b5b8bb5d..582ae92fb 100644
--- a/express/module/StaticModule.hpp
+++ b/express/module/StaticModule.hpp
@@ -25,8 +25,12 @@ class StaticModule : public Module {
     virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
     virtual void onClearCache() override;
     virtual int onOptimize(Interpreter::SessionMode stage) override;
+    const Session* getSession() const { return mSession.get(); }
 
 private:
+    ErrorCode _resize(const std::vector<Express::VARP>& inputs);
+    ErrorCode _execute();
+
     StaticModule() = default;
     void resetInputOutputs();
 
@@ -52,6 +56,8 @@ class StaticModule : public Module {
     std::vector<std::pair<Tensor*, Backend*>> mPrevInputTensor;
     std::vector<Tensor*> mOutputTensors;
     std::shared_ptr<Resource> mResource;
+    bool mShapeInferSeperate = false;
+    std::vector<MNN::Express::VARP> mOutputVars;
 };
 }
 }
diff --git a/include/MNN/ErrorCode.hpp b/include/MNN/ErrorCode.hpp
index 7ee64e6be..4d40d60e4 100644
--- a/include/MNN/ErrorCode.hpp
+++ b/include/MNN/ErrorCode.hpp
@@ -28,6 +28,16 @@ enum ErrorCode {
     // Op Resize Error
     TENSOR_NOT_SUPPORT = 20,
     TENSOR_NEED_DIVIDE = 21,
+
+    // File error
+    FILE_CREATE_FAILED = 30,
+    FILE_REMOVE_FAILED = 31,
+    FILE_OPEN_FAILED   = 32,
+    FILE_CLOSE_FAILED  = 33,
+    FILE_RESIZE_FAILED = 34,
+    FILE_SEEK_FAILED   = 35,
+    FILE_NOT_EXIST     = 36,
+    FILE_UNMAP_FAILED  = 37
 };
 } // namespace MNN
 
diff --git a/include/MNN/Interpreter.hpp b/include/MNN/Interpreter.hpp
index 6debbe3f0..bac8fb341 100644
--- a/include/MNN/Interpreter.hpp
+++ b/include/MNN/Interpreter.hpp
@@ -163,6 +163,15 @@ class MNN_PUBLIC Interpreter {
         /** Dynamic Reisze Optimization */
         Session_Resize_Check = 14, // Open Trace for resize
         Session_Resize_Fix = 15, // Apply Resize Optimization
+        
+        /** Set for Module's traceOrOptimize API. 
+         Module_Forward_Seperate:
+         when inputs is not empty , Module's onForward will only infer shape and alloc memory.
+         when inputs is empty , Module's onForward will only runSession to compute content.
+         Default is Module_Forward_Combine
+         */
+        Module_Forward_Separate = 16,
+        Module_Forward_Combine = 17,
     };
     /**
      * @brief The API shoud be called before create session.
@@ -220,6 +229,17 @@ class MNN_PUBLIC Interpreter {
         // 2: Only quantize value cache, use fp8 quantization
         // 3: quantize both key and value cache as described above
         KVCACHE_QUANT_OPTIONS = 7,
+
+        // size limit of kvcache in memory (for a single layer)
+        // if the size of kvcache exceeds the limit, it will be moved to disk
+        KVCACHE_SIZE_LIMIT = 8,
+    };
+
+    enum ExternalPathType {
+        // Path of the kvcache directory
+        EXTERNAL_PATH_KVCACHE_DIR = 0,
+
+        // Other types ...
     };
 
     enum GeometryComputeMask {
diff --git a/include/MNN/MNNDefine.h b/include/MNN/MNNDefine.h
index ab84cd8f8..215939a99 100644
--- a/include/MNN/MNNDefine.h
+++ b/include/MNN/MNNDefine.h
@@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
 #define STR(x) STR_IMP(x)
 #define MNN_VERSION_MAJOR 2
 #define MNN_VERSION_MINOR 9
-#define MNN_VERSION_PATCH 3
+#define MNN_VERSION_PATCH 4
 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
 #endif /* MNNDefine_h */
diff --git a/include/MNN/expr/Executor.hpp b/include/MNN/expr/Executor.hpp
index 3871827c9..367c15d03 100644
--- a/include/MNN/expr/Executor.hpp
+++ b/include/MNN/expr/Executor.hpp
@@ -103,6 +103,13 @@ class MNN_PUBLIC Executor {
          */
         void setCache(std::string cacheName);
         
+        /**
+         * @brief set the path of external files or directory
+         * @param path -- The path of a file or directory on disk
+         * @param type -- Type of the external path (see "enum ExternalPathType" in Interpreter.hpp)
+         */
+        void setExternalPath(std::string path, int type);
+        
         /**
          * @brief set external file.
          */
diff --git a/include/MNN/expr/Module.hpp b/include/MNN/expr/Module.hpp
index 2436b3985..1e5562de8 100644
--- a/include/MNN/expr/Module.hpp
+++ b/include/MNN/expr/Module.hpp
@@ -17,6 +17,7 @@
 #include <MNN/MNNForwardType.h>
 
 namespace MNN {
+class Session;
 namespace Express {
 struct SubGraph;
 class MNN_PUBLIC Module {
@@ -47,7 +48,7 @@ class MNN_PUBLIC Module {
 
     void setParameter(Express::VARP parameter, int index);
     static Module* createEmpty(const std::vector<Express::VARP>& parameters);
-    
+
     struct BackendInfo {
         MNNForwardType type = MNN_FORWARD_CPU;
         BackendConfig* config = nullptr;
@@ -63,8 +64,11 @@ class MNN_PUBLIC Module {
         // The weights will be rearranged in a general way, so the best implementation
         // may not be adopted if `rearrange` is enabled.
         bool rearrange = false;
-        
+
         BackendInfo* backend = nullptr;
+
+        // base module
+        const Module* base = nullptr;
     };
     static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const Config* config = nullptr);
     static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, const Config* config = nullptr);
@@ -102,7 +106,6 @@ class MNN_PUBLIC Module {
 
         EXPRP getOrClone(const EXPRP expr);
         VARP getOrClone(const VARP var);
-
     private:
         bool mShareParams = false;
         std::unordered_map<const Expr*, EXPRP> mExprMap;
@@ -117,6 +120,7 @@ class MNN_PUBLIC Module {
     static void destroy(Module* m);
 
     int traceOrOptimize(Interpreter::SessionMode stage);
+    std::vector<std::shared_ptr<Module>> getChildren() const { return mChildren; }
 protected:
     virtual int onOptimize(Interpreter::SessionMode stage) {
         return 0;
diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj
index 009adba67..f576703bf 100644
--- a/project/ios/MNN.xcodeproj/project.pbxproj
+++ b/project/ios/MNN.xcodeproj/project.pbxproj
@@ -743,6 +743,8 @@
 		9560EAD62BDE426A00C8D0B6 /* GeometryLayernorm.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9560EAD52BDE426A00C8D0B6 /* GeometryLayernorm.cpp */; };
 		956F52E12AB2D692004B13D9 /* ImageProcessUtils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 956F52E02AB2D692004B13D9 /* ImageProcessUtils.cpp */; };
 		956F52E32AB2D6A1004B13D9 /* ImageProcessUtils.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 956F52E22AB2D6A1004B13D9 /* ImageProcessUtils.hpp */; };
+		95772DCF2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S in Sources */ = {isa = PBXBuildFile; fileRef = 95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */; };
+		95772DD02C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S in Sources */ = {isa = PBXBuildFile; fileRef = 95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */; };
 		958375352A496E5C007C0A3E /* MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S in Sources */ = {isa = PBXBuildFile; fileRef = 958375342A496E5C007C0A3E /* MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S */; };
 		958B046429D2C89D00FC3AEF /* GemmInt8Executor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 958B046329D2C89D00FC3AEF /* GemmInt8Executor.cpp */; };
 		958B046629D2C8AF00FC3AEF /* GemmInt8Executor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 958B046529D2C8AF00FC3AEF /* GemmInt8Executor.hpp */; };
@@ -1577,6 +1579,8 @@
 		9560EAD52BDE426A00C8D0B6 /* GeometryLayernorm.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryLayernorm.cpp; sourceTree = "<group>"; };
 		956F52E02AB2D692004B13D9 /* ImageProcessUtils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ImageProcessUtils.cpp; sourceTree = "<group>"; };
 		956F52E22AB2D6A1004B13D9 /* ImageProcessUtils.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ImageProcessUtils.hpp; sourceTree = "<group>"; };
+		95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackC4Int8ForMatMulA_ARM82.S; sourceTree = "<group>"; };
+		95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackC4Int8ForMatMulA_ARM86.S; sourceTree = "<group>"; };
 		958375342A496E5C007C0A3E /* MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S; path = arm/arm64/MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S; sourceTree = "<group>"; };
 		958B046329D2C89D00FC3AEF /* GemmInt8Executor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GemmInt8Executor.cpp; sourceTree = "<group>"; };
 		958B046529D2C8AF00FC3AEF /* GemmInt8Executor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = GemmInt8Executor.hpp; sourceTree = "<group>"; };
@@ -1853,6 +1857,7 @@
 		488873A8215B639D0079B12E /* source */ = {
 			isa = PBXGroup;
 			children = (
+				CE482EF5288536DA007CD935 /* internal */,
 				4DF87C482887D3560003E2D4 /* calib3d */,
 				4D4CF4612760946500A36D9F /* imgproc */,
 				4D9A931B26255BDA00F9B43C /* coreml */,
@@ -2591,6 +2596,8 @@
 		92FF017C23AA0B4E00AC97F6 /* arm64 */ = {
 			isa = PBXGroup;
 			children = (
+				95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */,
+				95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */,
 				4DDD8E0F2B1D70C1005065D1 /* MNNTranspose16Bit8x8.S */,
 				95CE1E002AC57F7600EFB51E /* MNNReluWithSlopeChannelInt8.S */,
 				CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */,
@@ -2884,16 +2891,19 @@
 				CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */,
 				4DE4E82C275E307B0016A916 /* cv in Headers */,
 				1F501F842397BA5B004E8721 /* ImageProcess.hpp in Headers */,
+				CECF8C5D299CACFD00D3875B /* Log.hpp in Headers */,
 				1F501F822397BA5B004E8721 /* Interpreter.hpp in Headers */,
 				C4F906B327688C3A0026B847 /* NMSModule.hpp in Headers */,
 				1F501F882397BA5B004E8721 /* Tensor.hpp in Headers */,
 				1F501F872397BA5B004E8721 /* Matrix.h in Headers */,
 				CE8049AC2B31C65B009B422C /* CPULayerNorm.hpp in Headers */,
+				CECF8C5A299CACFD00D3875B /* WorkerThread.hpp in Headers */,
 				48C84B85250F711700EE7666 /* IfModule.hpp in Headers */,
 				4D9A937326255BDA00F9B43C /* CoreMLUnary.hpp in Headers */,
 				48C84B98250F71E900EE7666 /* CPUSoftmax.hpp in Headers */,
 				4882C8B8241A22B800DAC168 /* OpCommonUtils.hpp in Headers */,
 				48608B54250632EC00CB1D71 /* GeometryComputer.hpp in Headers */,
+				CECF8C7A299CAD9400D3875B /* sha1.h in Headers */,
 				4894C6EC27016F7200D8BE79 /* CPUResizeCache.hpp in Headers */,
 				92FF04A623AA0BFB00AC97F6 /* FileLoader.hpp in Headers */,
 				48F34733273A7C8400C45394 /* ImageProcessFunction.hpp in Headers */,
@@ -2907,6 +2917,7 @@
 				48925F352744AC0700919B37 /* CPUROIAlign.hpp in Headers */,
 				92FF029623AA0B5A00AC97F6 /* CPUCast.hpp in Headers */,
 				4D9A937826255BDA00F9B43C /* CoreMLBinary.hpp in Headers */,
+				CECF8C85299CAD9400D3875B /* log_util.h in Headers */,
 				4D6D7FD52656896600F80814 /* DenseConvolutionTiledExecutor.hpp in Headers */,
 				4D9A936626255BDA00F9B43C /* CoreMLExecutor.h in Headers */,
 				92FF027A23AA0B5A00AC97F6 /* CPUPool.hpp in Headers */,
@@ -2915,6 +2926,7 @@
 				1F501F802397BA5B004E8721 /* MNNDefine.h in Headers */,
 				19D0FE76285C66F200B74B1A /* MetalLayerNorm.hpp in Headers */,
 				489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */,
+				CECF8C86299CAD9400D3875B /* sds.h in Headers */,
 				1F501F7F2397BA5B004E8721 /* HalideRuntime.h in Headers */,
 				92FF029E23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.hpp in Headers */,
 				4D9A935B26255BDA00F9B43C /* NeuralNetwork.pb-c.h in Headers */,
@@ -2935,8 +2947,10 @@
 				481C2DEE25FE2CD6001ED6DF /* Arm82Functions.hpp in Headers */,
 				4894C6EA27016F7200D8BE79 /* UnaryUtils.hpp in Headers */,
 				EBD4842A2485FF650083CE95 /* Arm82Interp.hpp in Headers */,
+				CECF8C81299CAD9400D3875B /* log_util_imp.h in Headers */,
 				92FF037623AA0B5A00AC97F6 /* CPUBinary.hpp in Headers */,
 				4D9A935826255BDA00F9B43C /* FeatureTypes.pb-c.h in Headers */,
+				CECF8C7C299CAD9400D3875B /* hmac-sha.h in Headers */,
 				48608B53250632EC00CB1D71 /* GeometryComputerUtils.hpp in Headers */,
 				950B28F529F629A90002F454 /* CPUBinaryInt8.hpp in Headers */,
 				489D7A732550FDC800AD896A /* MetalBackend.hpp in Headers */,
@@ -2959,6 +2973,7 @@
 				4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */,
 				48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */,
 				92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */,
+				CECF8C77299CAD9400D3875B /* log_builder.h in Headers */,
 				4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */,
 				92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */,
 				4AF4FB26269ED235005BA97B /* SparseConvInt8TiledExecutor.hpp in Headers */,
@@ -2996,6 +3011,7 @@
 				92FF03CA23AA0B5A00AC97F6 /* CPUConvolutionDepthwise.hpp in Headers */,
 				92FF04A923AA0BFB00AC97F6 /* Schedule.hpp in Headers */,
 				489D7A9F2550FDC900AD896A /* MetalConvolutionCommon.hpp in Headers */,
+				CECF8C80299CAD9400D3875B /* lz4.h in Headers */,
 				92FF028623AA0B5A00AC97F6 /* CPUDeconvolution.hpp in Headers */,
 				489D7A722550FDC800AD896A /* MetalReLU6.hpp in Headers */,
 				92FF04B523AA0BFB00AC97F6 /* TensorUtils.hpp in Headers */,
@@ -3047,20 +3063,24 @@
 				92FF03A623AA0B5A00AC97F6 /* ConvolutionTiledExecutor.hpp in Headers */,
 				92FF036523AA0B5A00AC97F6 /* CPUResize.hpp in Headers */,
 				92FF04B423AA0BFB00AC97F6 /* MNNMemoryUtils.h in Headers */,
+				CECF8C88299CAD9400D3875B /* log_api.h in Headers */,
 				4A224A0D27D0C2D9000A9260 /* ConvolutionPackWinograd.hpp in Headers */,
 				4A224A0E27D0C2D9000A9260 /* ConvolutionPackFreeWinograd.hpp in Headers */,
 				4D9A937426255BDA00F9B43C /* CoreMLReduction.hpp in Headers */,
 				48C84B8B250F711700EE7666 /* PipelineModule.hpp in Headers */,
 				F41497D7278D8A21004A363A /* RuntimeAttr.hpp in Headers */,
+				CECF8C5B299CACFD00D3875B /* LogHelper.hpp in Headers */,
 				92FF04C123AA0BFB00AC97F6 /* Backend.hpp in Headers */,
 				482BFBCD28351BA1009210E4 /* ShaderMap.hpp in Headers */,
 				489D7A812550FDC900AD896A /* MetalPooling.hpp in Headers */,
+				CECF8C7F299CAD9400D3875B /* md5.h in Headers */,
 				92FF02A623AA0B5A00AC97F6 /* CPUQuantizedMaxPool.hpp in Headers */,
 				92FF028023AA0B5A00AC97F6 /* CPUFloatToInt8.hpp in Headers */,
 				92FF028723AA0B5A00AC97F6 /* CPUFixedPoint.hpp in Headers */,
 				C43C8227251894F400A0FF84 /* Vec.hpp in Headers */,
 				4819FB1D24C138DF0050BD09 /* GeometryConvUtils.hpp in Headers */,
 				489D7A952550FDC900AD896A /* MetalMatMul.hpp in Headers */,
+				CECF8C83299CAD9400D3875B /* log_define.h in Headers */,
 				C48CAE2628900C4A00271A6D /* ConvInt8Winograd.hpp in Headers */,
 				48F34730273A7C7300C45394 /* CPUImageProcess.hpp in Headers */,
 				489D7A702550FDC800AD896A /* MetalRaster.hpp in Headers */,
@@ -3281,6 +3301,7 @@
 				489D7A8A2550FDC900AD896A /* MetalConvolutionDepthwise.mm in Sources */,
 				48123003269EA83400EB7ABA /* ShapeUnique.cpp in Sources */,
 				92FF037D23AA0B5A00AC97F6 /* CPURelu.cpp in Sources */,
+				CECF8C5E299CACFD00D3875B /* WorkerThread.cpp in Sources */,
 				489D7A842550FDC900AD896A /* MetalBinary.mm in Sources */,
 				48747D6B245D9E33000B9709 /* GeometryFill.cpp in Sources */,
 				4819FB1F24C138DF0050BD09 /* GeometryConvUtils.cpp in Sources */,
@@ -3380,6 +3401,7 @@
 				48F34734273A7C8400C45394 /* ImageProcessFunction.cpp in Sources */,
 				6A131E4025823349002EC3D6 /* PluginKernel.cpp in Sources */,
 				48958781268EBA6F00EA01A7 /* CPUSegmentMean.cpp in Sources */,
+				CECF8C7B299CAD9400D3875B /* sha1.c in Sources */,
 				4D9A937026255BDA00F9B43C /* CoreMLUnary.cpp in Sources */,
 				92FF04A823AA0BFB00AC97F6 /* AutoTime.cpp in Sources */,
 				92FF04AE23AA0BFB00AC97F6 /* Backend.cpp in Sources */,
@@ -3434,6 +3456,7 @@
 				92FF03CE23AA0B5A00AC97F6 /* CPUOPRegister.cpp in Sources */,
 				92FF02B323AA0B5A00AC97F6 /* CPUInstanceNorm.cpp in Sources */,
 				4819FB2C24C1396A0050BD09 /* GeometryPoolGrad.cpp in Sources */,
+				CECF8C7E299CAD9400D3875B /* log_builder.cpp in Sources */,
 				92FF042223AA0B7100AC97F6 /* ShapeConcat.cpp in Sources */,
 				4D6D7FD12656891400F80814 /* MNNPackedSparseMatMulEpx4.S in Sources */,
 				4D5662CC299B76ED0031C1A1 /* MNNMaxPoolInt8.S in Sources */,
@@ -3500,6 +3523,7 @@
 				92FF02DE23AA0B5A00AC97F6 /* MNNSamplerC4BilinearOpt.S in Sources */,
 				48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */,
 				92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */,
+				95772DCF2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S in Sources */,
 				92FF02E623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */,
 				48747D64245D9E33000B9709 /* GeometryTile.cpp in Sources */,
 				92FF043723AA0B7100AC97F6 /* ShapeDetectionOutput.cpp in Sources */,
@@ -3512,6 +3536,7 @@
 				4D759B2C25FF89EE0037B0B6 /* GeometryShape.cpp in Sources */,
 				11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */,
 				48747D6E245D9E33000B9709 /* GeometrySlice.cpp in Sources */,
+				CECF8C7D299CAD9400D3875B /* md5.c in Sources */,
 				92FF041923AA0B7100AC97F6 /* ShapeQuantizedMaxPool.cpp in Sources */,
 				92FF038A23AA0B5A00AC97F6 /* CPURange.cpp in Sources */,
 				CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */,
@@ -3569,8 +3594,10 @@
 				92FF042E23AA0B7100AC97F6 /* ShapeProposal.cpp in Sources */,
 				92FF025923AA0B5A00AC97F6 /* CPUPoolInt8.cpp in Sources */,
 				92FF045B23AA0B7100AC97F6 /* ShapeShape.cpp in Sources */,
+				CECF8C87299CAD9400D3875B /* sds.c in Sources */,
 				9560EAD62BDE426A00C8D0B6 /* GeometryLayernorm.cpp in Sources */,
 				4D6D7FD72656896D00F80814 /* SparseConvolutionTiledExecutor.cpp in Sources */,
+				CECF8C82299CAD9400D3875B /* log_api.cpp in Sources */,
 				92FF03A823AA0B5A00AC97F6 /* WinogradOptFunction.cpp in Sources */,
 				950B28E229F627E00002F454 /* MNNBinarySubInt8.S in Sources */,
 				950B28F029F627F70002F454 /* MNNBinarySubInt8.S in Sources */,
@@ -3580,6 +3607,7 @@
 				4D9A936026255BDA00F9B43C /* Model.pb-c.c in Sources */,
 				CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */,
 				C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */,
+				CECF8C64299CAD8400D3875B /* LogHelper.mm in Sources */,
 				48FA474523AA127B00172C3B /* Executor.cpp in Sources */,
 				92FF02EA23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */,
 				48A8A61A21D101DE00C2B9A7 /* Matrix_CV.cpp in Sources */,
@@ -3605,6 +3633,7 @@
 				92FF027F23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.cpp in Sources */,
 				EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */,
 				92FF04A423AA0BFB00AC97F6 /* Interpreter.cpp in Sources */,
+				CECF8C5C299CACFD00D3875B /* Log.cpp in Sources */,
 				92FF045623AA0B7100AC97F6 /* ShapeReshape.cpp in Sources */,
 				92FF032523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */,
 				92FF044423AA0B7100AC97F6 /* ShapeLSTM.cpp in Sources */,
@@ -3641,6 +3670,7 @@
 				92FF02B623AA0B5A00AC97F6 /* CPUUnary.cpp in Sources */,
 				92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */,
 				CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */,
+				CECF8C78299CAD9400D3875B /* log_util_imp.cpp in Sources */,
 				92FF02CA23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */,
 				952298B22B4D39050043978B /* MetalLoop.mm in Sources */,
 				48925F372744AC2A00919B37 /* ShapeROIAlign.cpp in Sources */,
@@ -3666,11 +3696,13 @@
 				92FF02FF23AA0B5A00AC97F6 /* MNNFloat2Int8.S in Sources */,
 				4D9A937926255BDA00F9B43C /* CoreMLRaster.cpp in Sources */,
 				48417FF224D13BF50056D9A7 /* GeometrySelect.cpp in Sources */,
+				CECF8C84299CAD9400D3875B /* lz4.c in Sources */,
 				489D7A7E2550FDC900AD896A /* MNNMetalContext.mm in Sources */,
 				92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */,
 				92FF036B23AA0B5A00AC97F6 /* CPUResize.cpp in Sources */,
 				92FF02C723AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
 				92FF030923AA0B5A00AC97F6 /* MNNNV21ToBGRUnit.S in Sources */,
+				CECF8C79299CAD9400D3875B /* hmac-sha.cpp in Sources */,
 				92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */,
 				92FF04C023AA0BFB00AC97F6 /* Tensor.cpp in Sources */,
 				CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */,
@@ -3720,6 +3752,7 @@
 				92FF043623AA0B7100AC97F6 /* ShapeSelect.cpp in Sources */,
 				92FF042B23AA0B7100AC97F6 /* ShapeOneHot.cpp in Sources */,
 				92FF043C23AA0B7100AC97F6 /* ShapeExpandDims.cpp in Sources */,
+				95772DD02C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S in Sources */,
 				92FF045723AA0B7100AC97F6 /* ShapeTranspose.cpp in Sources */,
 				92FF031023AA0B5A00AC97F6 /* MNNMinFloat.S in Sources */,
 				92FF032A23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */,
@@ -4068,7 +4101,7 @@
 				CODE_SIGN_STYLE = Automatic;
 				DEAD_CODE_STRIPPING = YES;
 				DEFINES_MODULE = YES;
-				DEVELOPMENT_TEAM = 6G7464HHUS;
+				DEVELOPMENT_TEAM = Q48UX93J22;
 				DYLIB_COMPATIBILITY_VERSION = 1;
 				DYLIB_CURRENT_VERSION = 1;
 				DYLIB_INSTALL_NAME_BASE = "@rpath";
@@ -4155,7 +4188,7 @@
 				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
 				ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage;
 				CODE_SIGN_STYLE = Automatic;
-				DEVELOPMENT_TEAM = 6G7464HHUS;
+				DEVELOPMENT_TEAM = Q48UX93J22;
 				GCC_ENABLE_CPP_EXCEPTIONS = NO;
 				GCC_ENABLE_CPP_RTTI = NO;
 				HEADER_SEARCH_PATHS = (
@@ -4170,7 +4203,7 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
 				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
-				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vj;
+				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcdedddddd;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				TARGETED_DEVICE_FAMILY = "1,2";
 			};
@@ -4202,7 +4235,7 @@
 				MARKETING_VERSION = 1.0;
 				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
 				MTL_FAST_MATH = YES;
-				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.ddddddddd;
+				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vj;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SWIFT_EMIT_LOC_STRINGS = YES;
 				TARGETED_DEVICE_FAMILY = "1,2";
@@ -4234,7 +4267,7 @@
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
 				MARKETING_VERSION = 1.0;
 				MTL_FAST_MATH = YES;
-				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.ddddddddd;
+				PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vj;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SWIFT_EMIT_LOC_STRINGS = YES;
 				TARGETED_DEVICE_FAMILY = "1,2";
@@ -4284,3 +4317,4 @@
 	};
 	rootObject = 0F1465AE1FA18D1000F9860A /* Project object */;
 }
+
diff --git a/project/ios/Playground/AppDelegate.mm b/project/ios/Playground/AppDelegate.mm
index d073b12a8..7efc31eae 100644
--- a/project/ios/Playground/AppDelegate.mm
+++ b/project/ios/Playground/AppDelegate.mm
@@ -11,8 +11,10 @@
 #include <MNN/MNNForwardType.h>
 #include <MNN/Interpreter.hpp>
 #import <MNN/expr/Executor.hpp>
+#define MNN_OPEN_TIME_TRACE
+#include <MNN/AutoTime.hpp>
 #import "benchmark.h"
-#define TEST_WORKMODE 0
+#define TEST_WORKMODE 2
 @implementation AppDelegate
 
 - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
@@ -44,11 +46,55 @@ - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(
     auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle);
     CFRelease(url);
     auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8);
-    auto res     = std::string(cstring) + "/models/mobilenet_v2_auth.mnn";
+    auto res     = std::string(cstring) + "/model/MobileNet/v1/mobilenet_v1.caffe.mnn";
+    CFRelease(string);
     
     MNN::Interpreter* interpreter = MNN::Interpreter::createFromFile(res.c_str());
+    interpreter->setSessionHint(MNN::Interpreter::GEOMETRY_COMPUTE_MASK, 0);
     MNN::ScheduleConfig config;
-    interpreter->createSession(config);
+    config.type = MNN_FORWARD_NN;
+    config.numThread = 1;
+    MNN::BackendConfig bnC;
+    bnC.precision = MNN::BackendConfig::Precision_Normal;
+    config.backendConfig = &bnC;
+    auto session = interpreter->createSession(config);
+    auto inpDev = interpreter->getSessionInput(session, nullptr);
+    auto outDev = interpreter->getSessionOutput(session, nullptr);
+    auto input = std::shared_ptr<MNN::Tensor>(new MNN::Tensor(inpDev));
+    auto output = std::shared_ptr<MNN::Tensor>(new MNN::Tensor(outDev));
+    auto inputHost = input->host<float>();
+    int inputSize = input->elementSize();
+    for (int v=0; v<inputSize; ++v) {
+        inputHost[v] = (float)rand() / RAND_MAX;
+    }
+    auto outputHost = output->host<float>();
+    int outputSize = output->elementSize();
+
+    for (int i=0; i<2; ++i) {
+        inpDev->copyFromHostTensor(input.get());
+        interpreter->runSession(session);
+        outDev->copyToHostTensor(output.get());
+        float sum = 0.0f;
+        float maxv = 0.0f;
+        float minv = 0.0f;
+        for (int v=0; v<outputSize; ++v) {
+            float value = outputHost[v];
+            maxv = ALIMAX(maxv, value);
+            minv = ALIMIN(minv, value);
+            sum += value;
+        }
+        float mean = sum / (float)outputSize;
+        MNN_PRINT("Size:%d, Max:%f, Min:%f, Avg:%f\n", outputSize, maxv, minv, mean);
+    }
+    {
+        AUTOTIME;
+        for (int i=0; i<10; ++i) {
+            inpDev->copyFromHostTensor(input.get());
+            interpreter->runSession(session);
+            outDev->copyToHostTensor(output.get());
+        }
+    }
+    delete interpreter;
 #endif
     return YES;
 }
diff --git a/pymnn/examples/MNNTrain/mobilenet_finetune/mobilenet_transfer.py b/pymnn/examples/MNNTrain/mobilenet_finetune/mobilenet_transfer.py
index 79c3bf9fd..a4017f913 100644
--- a/pymnn/examples/MNNTrain/mobilenet_finetune/mobilenet_transfer.py
+++ b/pymnn/examples/MNNTrain/mobilenet_finetune/mobilenet_transfer.py
@@ -14,7 +14,6 @@ def load_feature_extractor(model_file):
     output_var = var_dict['MobilenetV2/Logits/AvgPool']
     # 'False' means the parameters int this module will not update during training
     feature_extractor = nn.load_module([input_var], [output_var], False)
-    feature_extractor = nn.FixModule(feature_extractor)  # fix feature extractor
 
     return feature_extractor
 
diff --git a/pymnn/examples/MNNTrain/quantization_aware_training/imagenet_dataset.py b/pymnn/examples/MNNTrain/quantization_aware_training/imagenet_dataset.py
deleted file mode 100644
index a2f4cb0a3..000000000
--- a/pymnn/examples/MNNTrain/quantization_aware_training/imagenet_dataset.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import numpy as np
-from PIL import Image
-import MNN
-F = MNN.expr
-
-
-# adapted from pycaffe
-def load_image(filename, color=True):
-    """
-    Load an image converting from grayscale or alpha as needed.
-
-    Parameters
-    ----------
-    filename : string
-    color : boolean
-        flag for color format. True (default) loads as RGB while False
-        loads as intensity (if image is already grayscale).
-
-    Returns
-    -------
-    image : an image with type np.float32 in range [0, 1]
-        of size (H x W x 3) in RGB or
-        of size (H x W x 1) in grayscale.
-    """
-    img = Image.open(filename)
-    img = np.array(img)
-    if img.ndim == 2:
-        img = img[:, :, np.newaxis]
-        if color:
-            img = np.tile(img, (1, 1, 3))
-    elif img.shape[2] == 4:
-        img = img[:, :, :3]
-    return img
-
-
-def center_crop(image_data, crop_factor):
-    height, width, channels = image_data.shape
-
-    h_size = int(height * crop_factor)
-    h_start = int((height - h_size) / 2)
-    h_end = h_start + h_size
-
-    w_size = int(width * crop_factor)
-    w_start = int((width - w_size) / 2)
-    w_end = w_start + w_size
-
-    cropped_image = image_data[h_start:h_end, w_start:w_end, :]
-
-    return cropped_image
-
-
-def resize_image(image, shape):
-    im = Image.fromarray(image)
-    im = im.resize(shape)
-    resized_image = np.array(im)
-
-    return resized_image
-
-
-class ImagenetDataset(MNN.data.Dataset):
-    def __init__(self, image_folder, val_txt, training_dataset=True):
-        super(ImagenetDataset, self).__init__()
-        self.is_training_dataset = training_dataset
-
-        self.image_folder = image_folder
-
-        if self.is_training_dataset:
-            f = open(val_txt)
-            self.image_list = f.readlines()[0:10000]
-            f.close()
-        else:
-            f = open(val_txt)
-            self.image_list = f.readlines()[10000:50000]
-            f.close()
-
-    def __getitem__(self, index):
-        image_name = self.image_folder + self.image_list[index].split(' ')[0]
-        image_label = int(self.image_list[index].split(' ')[1]) + 1  # align with tf mobilenet labels, we need add 1
-
-        image_data = load_image(image_name)
-        image_data = center_crop(image_data, 0.85)
-        image_data = resize_image(image_data, (224, 224))
-
-        image_data = (image_data - 127.5) / 127.5
-
-        dv = F.const(image_data.flatten().tolist(), [224, 224, 3], F.data_format.NHWC)
-        dl = F.const([image_label], [], F.data_format.NHWC, F.dtype.int)
-        # first for inputs, and may have many inputs, so it's a list
-        # second for targets, also, there may be more than one targets
-        return [dv], [dl]
-
-    def __len__(self):
-        # size of the dataset
-        if self.is_training_dataset:
-            return 10000
-        else:
-            return 40000
diff --git a/pymnn/examples/MNNTrain/quantization_aware_training/quant_aware_training.py b/pymnn/examples/MNNTrain/quantization_aware_training/quant_aware_training.py
deleted file mode 100644
index ac6e32f05..000000000
--- a/pymnn/examples/MNNTrain/quantization_aware_training/quant_aware_training.py
+++ /dev/null
@@ -1,125 +0,0 @@
-from __future__ import print_function
-import time
-import argparse
-import numpy as np
-import MNN
-from imagenet_dataset import ImagenetDataset
-nn = MNN.nn
-F = MNN.expr
-
-
-def test_func(net, test_dataloader):
-    net.train(False)
-    test_dataloader.reset()
-
-    correct = 0
-    total = 0
-    for i in range(test_dataloader.iter_number):
-        example = test_dataloader.next()
-        input_data = example[0]
-        output_target = example[1]
-        data = input_data[0]  # which input, model may have more than one inputs
-        label = output_target[0]  # also, model may have more than one outputs
-
-        predict = net(data)
-        predict = F.argmax(predict, 1)
-        predict = np.array(predict.read())
-        label = np.array(label.read())
-        correct += (np.sum(label == predict))
-        total += label.size
-
-        if (i+1) % 10 == 0:
-            print("test iteration", i+1, ", accuracy: ", correct / total * 100, "%")
-
-    print("test acc: ", correct * 100.0 / test_dataloader.size, "%")
-
-
-def train_func(net, train_dataloader, opt, num_classes):
-    net.train(True)
-    train_dataloader.reset()
-
-    t0 = time.time()
-    # for i in range(train_dataloader.iter_number):
-    for i in range(100): # actually, in our full experiment, we only need 3K images using ILSVRC2012 training dataset
-        example = train_dataloader.next()
-        input_data = example[0]
-        output_target = example[1]
-        data = input_data[0]  # which input, model may have more than one inputs
-        label = output_target[0]  # also, model may have more than one outputs
-
-        predict = net.forward(data)
-        target = F.one_hot(F.cast(label, F.int), num_classes, 1, 0)
-        loss = nn.loss.cross_entropy(predict, target)
-        opt.step(loss)
-
-        if i % 10 == 0:
-            print("train loss: ", loss.read())
-
-    t1 = time.time()
-    cost = t1 - t0
-    print("Epoch cost: %.3f s." % cost)
-    F.save(net.parameters, "temp.mobilenet.snapshot")
-
-
-def demo():
-    '''
-    demo for quant-aware-training using tf mobilenet v2.
-    the dataset used is the ILSVRC2012 validation dataset which has 50000 images
-    10000 for training (actually we only need 3K in our standard experiment using ILSVRC2012 training dataset)
-    40000 for testing
-    '''
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_file", type=str, required=True,\
-        help="mobilenet MNN model file")
-    parser.add_argument("--val_image_path", type=str, required=True,\
-        help="path to ILSVRC2012 val images")
-    parser.add_argument("--val_txt", type=str, required=True,\
-                        help="path to ILSVRC2012 val.txt")
-
-    args = parser.parse_args()
-
-    model_file = args.model_file
-    image_path = args.val_image_path
-    val_txt = args.val_txt
-
-    train_dataset = ImagenetDataset(image_path, val_txt, True)
-    test_dataset = ImagenetDataset(image_path, val_txt, False)
-    train_dataloader = MNN.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
-    test_dataloader = MNN.data.DataLoader(test_dataset, batch_size=10, shuffle=False)
-
-    m = F.load_as_dict(model_file)
-
-    inputs_outputs = F.get_inputs_and_outputs(m)
-    for key in inputs_outputs[0].keys():
-        print('input names:\t', key)
-    for key in inputs_outputs[1].keys():
-        print('output names:\t', key)
-
-    # get inputs and outputs
-    inputs = [m['input']]
-    outputs = [m['MobilenetV2/Predictions/Reshape_1']]
-
-    net = nn.load_module(inputs, outputs, True)
-
-    # turn net to quant-aware-training module
-    nn.compress.train_quant(net, quant_bits=8)
-
-    opt = MNN.optim.SGD(net, 1e-5, 0.9, 0.00004)
-
-    num_classes = 1001
-
-    for epoch in range(5):
-        train_func(net, train_dataloader, opt, num_classes)
-
-        # save model
-        file_name = '%d.mobilenet.mnn' % epoch
-        net.train(False)
-        predict = net.forward(F.placeholder([1, 3, 224, 224], F.NC4HW4))
-        print("Save to " + file_name)
-        F.save([predict], file_name)
-
-        test_func(net, test_dataloader)
-
-
-if __name__ == "__main__":
-    demo()
diff --git a/pymnn/pip_package/build_deps.py b/pymnn/pip_package/build_deps.py
index 8d0297c90..320975bf5 100644
--- a/pymnn/pip_package/build_deps.py
+++ b/pymnn/pip_package/build_deps.py
@@ -118,7 +118,7 @@ def build_deps():
         os.system('cmake ' + extra_opts + '-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release \
             -DMNN_BUILD_SHARED_LIBS=ON -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF\
             -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \
-            .. && make MNN MNNConvertDeps -j4')
+            .. && make MNN MNNConvertDeps -j64')
 ################################################################################
 # Building dependent libraries
 ################################################################################
diff --git a/pymnn/pip_package/pyproject.toml b/pymnn/pip_package/pyproject.toml
index 57cc5503b..c178a4ebc 100644
--- a/pymnn/pip_package/pyproject.toml
+++ b/pymnn/pip_package/pyproject.toml
@@ -11,12 +11,11 @@ build-backend = "setuptools.build_meta"
 
 [tool.cibuildwheel]
 test-skip = [
-    "*",
     "cp36-*",
     "*-macosx_arm64"
 ]
 test-requires = [
-    "opencv-python",
+    "opencv-python==4.6.0.66",
     "numpy",
     "torch"
 ]
@@ -42,8 +41,6 @@ repair-wheel-command = ""
 build = "cp*-manylinux*"
 skip = "pp*"
 before-all = [
-    "sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-*",
-    "sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://mirrors.aliyun.com|g' /etc/yum.repos.d/CentOS-*",
     "yum install -y wget",
     "pushd pymnn/pip_package",
     "python3 build_deps.py --torch",
diff --git a/pymnn/test/model_test.py b/pymnn/test/model_test.py
index d25f7f619..1da4e85f0 100644
--- a/pymnn/test/model_test.py
+++ b/pymnn/test/model_test.py
@@ -90,10 +90,16 @@ def createTensor(tensor, file=''):
         data = loadtxt(file, shape, dtype)
     return MNN.Tensor(shape, tensor.getDataType(), data, tensor.getDimensionType())
 
-def compareTensor(tensor, file, atol=5e-2):
+def compareTensor(tensor, file, tolerance=5e-2):
     outputNumpyData = tensor.getNumpyData()
     expectNumpyData = loadtxt(file, tensor.getShape())
-    return np.allclose(outputNumpyData, expectNumpyData, atol=atol)
+    max_abs_dif = np.abs(outputNumpyData - expectNumpyData).max()
+    max_exp_val = np.abs(expectNumpyData).max()
+    diff_rate = max_abs_dif / max_exp_val
+    if diff_rate > tolerance:
+        print(f'# Error: max_abs_dif: {max_abs_dif}, max_exp_val: {max_exp_val}, diff_rate: {diff_rate}')
+        return False
+    return True
 
 def log_result(success, model):
     global total_num
@@ -240,3 +246,6 @@ def testPymnnConfig(model_root_dir):
         for wrong in wrongs:
             print(wrong)
     print('TEST_NAME_PYMNN_MODEL: Pymnn模型测试\nTEST_CASE_AMOUNT_PYMNN_MODEL: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(wrongs), total_num - len(wrongs)))
+    print('TEST_CASE={\"name\":\"Pymnn模型测试\",\"failed\":%d,\"passed\":%d}\n'%(len(wrongs), total_num - len(wrongs)))
+    if len(wrongs) > 0:
+       exit(1)
diff --git a/pymnn/test/unit_test.py b/pymnn/test/unit_test.py
index d9addf734..853c78961 100644
--- a/pymnn/test/unit_test.py
+++ b/pymnn/test/unit_test.py
@@ -45,6 +45,7 @@ def tearDownClass(cls):
         skipped = len(cls.skipped)
         try:
             print('\nTEST_NAME_PYMNN_UNIT: Pymnn单元测试\nTEST_CASE_AMOUNT_PYMNN_UNIT: {\"blocked\":%d,\"failed\":%d,\"passed\":%d,\"skipped\":%d}\n'%(blocked, failed, passed, skipped))
+            print('\nTEST_CASE={\"name\":\"Pymnn单元测试\",\"failed\":%d,\"passed\":%d}\n'%(failed, passed))
         except:
             print('\nTEST_NAME_PYMNN_UNIT: PymnnUnitTest\nTEST_CASE_AMOUNT_PYMNN_UNIT: {\"blocked\":%d,\"failed\":%d,\"passed\":%d,\"skipped\":%d}\n'%(blocked, failed, passed, skipped))
     def run(self, result=None):
diff --git a/pymnn/update_mnn_wrapper_assets.sh b/pymnn/update_mnn_wrapper_assets.sh
index d0cbedc47..cb476cafb 100755
--- a/pymnn/update_mnn_wrapper_assets.sh
+++ b/pymnn/update_mnn_wrapper_assets.sh
@@ -43,7 +43,7 @@ find . -name __pycache__ | xargs rm -rf
 if cmdExist pyenv; then
     pyenv global $py_version
 fi
-python -c "import compileall; compileall.compile_dir('/tmp/mnn_py/MNN', force=True)"
+python2 -c "import compileall; compileall.compile_dir('/tmp/mnn_py/MNN', force=True)"
 find . -name "*.py" | xargs rm -rf
 cd ..
 zip -r MNN.zip MNN
diff --git a/source/backend/arm82/Arm82Functions.cpp b/source/backend/arm82/Arm82Functions.cpp
index 19038ec94..2e4e9dc6b 100644
--- a/source/backend/arm82/Arm82Functions.cpp
+++ b/source/backend/arm82/Arm82Functions.cpp
@@ -691,6 +691,11 @@ bool Arm82Functions::init() {
 #define FUNC_PTR_ASSIGN(dst, src) dst = (decltype(dst))(src)
     gInstance = new CoreFunctions;
 
+    FUNC_PTR_ASSIGN(gInstance->MNNFp32ToFp8, MNNFp32ToFp8);
+    FUNC_PTR_ASSIGN(gInstance->MNNFp16ToFp8, MNNFp16ToFp8);
+    FUNC_PTR_ASSIGN(gInstance->MNNFp8ToFp32, MNNFp8ToFp32);
+    FUNC_PTR_ASSIGN(gInstance->MNNFp8ToFp16, MNNFp8ToFp16);
+
     FUNC_PTR_ASSIGN(gInstance->MNNFp32ToLowp, MNNQuantizeFP16);
     FUNC_PTR_ASSIGN(gInstance->MNNLowpToFp32, MNNDequantizeFP16);
     gInstance->bytes = 2;
diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNCountMinMax_ARM82.S b/source/backend/arm82/asm/arm64/low_memory/MNNCountMinMax_ARM82.S
index 680e6f2ac..7c32ca912 100644
--- a/source/backend/arm82/asm/arm64/low_memory/MNNCountMinMax_ARM82.S
+++ b/source/backend/arm82/asm/arm64/low_memory/MNNCountMinMax_ARM82.S
@@ -116,7 +116,7 @@ stp d8,  d9,  [sp, #(16 * 3)]
 Start:
 ld1 {v31.8h}, [x0], #16
 sub x3, x3, #1
-mov v30.8h, v31.8h // v30:min v31:max
+mov v30.16b, v31.16b // mov v30.8h, v31.8h // v30:min v31:max
 
 
 TILE_24:
diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S
index 118b4f104..ae6dd794b 100644
--- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S
+++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S
@@ -139,9 +139,9 @@ LoopH:
     sxtl v1.8h, v2.8b
     scvtf v0.8h, v0.8h
     scvtf v1.8h, v1.8h
-    mov v2.8h, v7.8h
+    mov v2.16b, v7.16b // mov v2.8h, v7.8h
     fmla v2.8h, v1.8h, v5.8h
-    mov v1.8h, v6.8h
+    mov v1.16b, v6.16b // mov v1.8h, v6.8h
     fmla v1.8h, v0.8h, v4.8h
 
     cbnz x19, LH8_BLOCK_GT_0
@@ -187,9 +187,9 @@ LoopH:
         sxtl v1.8h, v2.8b
         scvtf v0.8h, v0.8h
         scvtf v1.8h, v1.8h
-        mov v2.8h, v7.8h
+        mov v2.16b, v7.16b // mov v2.8h, v7.8h
         fmla v2.8h, v1.8h, v5.8h
-        mov v1.8h, v6.8h
+        mov v1.16b, v6.16b // mov v1.8h, v6.8h
         fmla v1.8h, v0.8h, v4.8h
 
         ld1 {v0.8h}, [x15], #16
@@ -254,7 +254,7 @@ LoopHRemain:
 
     sxtl v3.8h, v3.8b
     scvtf v6.8h, v3.8h
-    mov v3.8h, v21.8h
+    mov v3.16b, v21.16b // mov v3.8h, v21.8h
     fmla v3.8h, v6.8h, v20.8h
 
     ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24
@@ -303,7 +303,7 @@ LoopHRemain:
         sxtl v3.8h, v3.8b
 
         scvtf v6.8h, v3.8h
-        mov v3.8h, v21.8h
+        mov v3.16b, v21.16b // mov v3.8h, v21.8h
         fmla v3.8h, v6.8h, v20.8h
 
         ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24
diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S
index 8f92ac238..52ab11e13 100644
--- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S
+++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S
@@ -132,9 +132,9 @@ LoopH:
     sxtl2 v1.8h, v2.16b
     scvtf v0.8h, v0.8h
     scvtf v1.8h, v1.8h
-    mov v2.8h, v7.8h
+    mov v2.16b, v7.16b // mov v2.8h, v7.8h
     fmla v2.8h, v1.8h, v5.8h
-    mov v1.8h, v6.8h
+    mov v1.16b, v6.16b // mov v1.8h, v6.8h
     fmla v1.8h, v0.8h, v4.8h
 
     cbnz x19, LH8_BLOCK_GT_0
@@ -174,9 +174,9 @@ LoopH:
         sxtl2 v1.8h, v2.16b
         scvtf v0.8h, v0.8h
         scvtf v1.8h, v1.8h
-        mov v2.8h, v7.8h
+        mov v2.16b, v7.16b // mov v2.8h, v7.8h
         fmla v2.8h, v1.8h, v5.8h
-        mov v1.8h, v6.8h
+        mov v1.16b, v6.16b // mov v1.8h, v6.8h
         fmla v1.8h, v0.8h, v4.8h
 
         ld1 {v0.8h}, [x15], #16
@@ -235,7 +235,7 @@ LoopHRemain:
     ld1 {v3.16b}, [x2], #16
     sxtl v3.8h, v3.8b
     scvtf v6.8h, v3.8h
-    mov v3.8h, v21.8h
+    mov v3.16b, v21.16b // mov v3.8h, v21.8h
     fmla v3.8h, v6.8h, v20.8h
 
     ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24
@@ -278,7 +278,7 @@ LoopHRemain:
         ld1 {v3.16b}, [x2], #16
         sxtl v0.8h, v3.8b
         scvtf v6.8h, v0.8h
-        mov v3.8h, v21.8h
+        mov v3.16b, v21.16b // mov v3.8h, v21.8h
         fmla v3.8h, v6.8h, v20.8h
 
         ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24
diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S
index 3949f7414..f23d2902c 100644
--- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S
+++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S
@@ -85,8 +85,8 @@ LoopE8:
         sxtl v2.8h, v4.8b
         scvtf v1.8h, v1.8h
         scvtf v2.8h, v2.8h
-        mov v3.8h, v14.8h
-        mov v4.8h, v15.8h
+        mov v3.16b, v14.16b // mov v3.8h, v14.8h
+        mov v4.16b, v15.16b // mov v4.8h, v15.8h
         fmla v3.8h, v1.8h, v12.8h
         fmla v4.8h, v2.8h, v13.8h
 
@@ -158,8 +158,8 @@ LoopE8:
 
             scvtf v1.8h, v1.8h
             scvtf v2.8h, v2.8h
-            mov v3.8h, v14.8h
-            mov v4.8h, v15.8h
+            mov v3.16b, v14.16b // mov v3.8h, v14.8h
+            mov v4.16b, v15.16b // mov v4.8h, v15.8h
             fmla v3.8h, v1.8h, v12.8h
             fmla v4.8h, v2.8h, v13.8h
 
@@ -276,7 +276,7 @@ LoopE8:
 
         sxtl v1.8h, v3.8b
         scvtf v1.8h, v1.8h
-        mov v3.8h, v14.8h
+        mov v3.16b, v14.16b // mov v3.8h, v14.8h
         fmla v3.8h, v1.8h, v12.8h
 
         ld1 {v0.8h}, [x15], x11
@@ -316,7 +316,7 @@ LoopE8:
 
             sxtl v1.8h, v3.8b
             scvtf v1.8h, v1.8h
-            mov v3.8h, v14.8h
+            mov v3.16b, v14.16b // mov v3.8h, v14.8h
             fmla v3.8h, v1.8h, v12.8h
 
             ld1 {v0.8h}, [x15], x11
@@ -410,8 +410,8 @@ blt E1
         sxtl v2.8h, v4.8b
         scvtf v1.8h, v1.8h
         scvtf v2.8h, v2.8h
-        mov v3.8h, v14.8h
-        mov v4.8h, v15.8h
+        mov v3.16b, v14.16b // mov v3.8h, v14.8h
+        mov v4.16b, v15.16b // mov v4.8h, v15.8h
         fmla v3.8h, v1.8h, v12.8h
         fmla v4.8h, v2.8h, v13.8h
 
@@ -459,8 +459,8 @@ blt E1
 
             scvtf v1.8h, v1.8h
             scvtf v2.8h, v2.8h
-            mov v3.8h, v14.8h
-            mov v4.8h, v15.8h
+            mov v3.16b, v14.16b // mov v3.8h, v14.8h
+            mov v4.16b, v15.16b // mov v4.8h, v15.8h
             fmla v3.8h, v1.8h, v12.8h
             fmla v4.8h, v2.8h, v13.8h
 
@@ -541,7 +541,7 @@ blt E1
 
     sxtl v1.8h, v3.8b
     scvtf v1.8h, v1.8h
-    mov v3.8h, v14.8h
+    mov v3.16b, v14.16b // mov v3.8h, v14.8h
     fmla v3.8h, v1.8h, v12.8h
 
     ld1 {v0.4h}, [x15], x11
@@ -570,7 +570,7 @@ blt E1
 
         sxtl v1.8h, v3.8b
         scvtf v1.8h, v1.8h
-        mov v3.8h, v14.8h
+        mov v3.16b, v14.16b // mov v3.8h, v14.8h
         fmla v3.8h, v1.8h, v12.8h
 
         ld1 {v0.4h}, [x15], x11
@@ -647,8 +647,8 @@ LoopE1:
         sxtl v2.8h, v4.8b
         scvtf v1.8h, v1.8h
         scvtf v2.8h, v2.8h
-        mov v3.8h, v14.8h
-        mov v4.8h, v15.8h
+        mov v3.16b, v14.16b // mov v3.8h, v14.8h
+        mov v4.16b, v15.16b // mov v4.8h, v15.8h
         fmla v3.8h, v1.8h, v12.8h
         fmla v4.8h, v2.8h, v13.8h
 
@@ -680,8 +680,8 @@ LoopE1:
             sxtl v2.8h, v4.8b
             scvtf v1.8h, v1.8h
             scvtf v2.8h, v2.8h
-            mov v3.8h, v14.8h
-            mov v4.8h, v15.8h
+            mov v3.16b, v14.16b // mov v3.8h, v14.8h
+            mov v4.16b, v15.16b // mov v4.8h, v15.8h
             fmla v3.8h, v1.8h, v12.8h
             fmla v4.8h, v2.8h, v13.8h
 
@@ -734,7 +734,7 @@ LoopE1:
 
     sxtl v1.8h, v3.8b
     scvtf v1.8h, v1.8h
-    mov v3.8h, v14.8h
+    mov v3.16b, v14.16b // mov v3.8h, v14.8h
     fmla v3.8h, v1.8h, v12.8h
     ld1 {v0.h}[0], [x15], x11
     cbnz x26, LE1H4_BLOCK_GT_0
@@ -756,7 +756,7 @@ LoopE1:
 
         sxtl v1.8h, v3.8b
         scvtf v1.8h, v1.8h
-        mov v3.8h, v14.8h
+        mov v3.16b, v14.16b // mov v3.8h, v14.8h
         fmla v3.8h, v1.8h, v12.8h
         ld1 {v0.h}[0], [x15], x11
         fmla v16.8h, v3.8h, v0.h[0]
diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S
index f73046ec0..28d34b174 100644
--- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S
+++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S
@@ -75,8 +75,8 @@ LoopE8:
         sxtl2 v2.8h, v10.16b
         scvtf v1.8h, v1.8h
         scvtf v2.8h, v2.8h
-        mov v3.8h, v14.8h
-        mov v4.8h, v15.8h
+        mov v3.16b, v14.16b // mov v3.8h, v14.8h
+        mov v4.16b, v15.16b // mov v4.8h, v15.8h
         fmla v3.8h, v1.8h, v12.8h
         fmla v4.8h, v2.8h, v13.8h
 
@@ -139,8 +139,8 @@ LoopE8:
             sxtl2 v2.8h, v10.16b
             scvtf v1.8h, v1.8h
             scvtf v2.8h, v2.8h
-            mov v3.8h, v14.8h
-            mov v4.8h, v15.8h
+            mov v3.16b, v14.16b // mov v3.8h, v14.8h
+            mov v4.16b, v15.16b // mov v4.8h, v15.8h
             fmla v3.8h, v1.8h, v12.8h
             fmla v4.8h, v2.8h, v13.8h
 
@@ -253,7 +253,7 @@ LoopE8:
         ld1 {v10.16b}, [x13], #16
         sxtl v1.8h, v10.8b
         scvtf v1.8h, v1.8h
-        mov v3.8h, v14.8h
+        mov v3.16b, v14.16b // mov v3.8h, v14.8h
         fmla v3.8h, v1.8h, v12.8h
 
         ld1 {v0.8h}, [x15], x11
@@ -289,7 +289,7 @@ LoopE8:
             ld1 {v10.16b}, [x13], #16
             sxtl v1.8h, v10.8b
             scvtf v1.8h, v1.8h
-            mov v3.8h, v14.8h
+            mov v3.16b, v14.16b // mov v3.8h, v14.8h
             fmla v3.8h, v1.8h, v12.8h
 
             ld1 {v0.8h}, [x15], x11
@@ -377,8 +377,8 @@ blt E1
         sxtl2 v2.8h, v10.16b
         scvtf v1.8h, v1.8h
         scvtf v2.8h, v2.8h
-        mov v3.8h, v14.8h
-        mov v4.8h, v15.8h
+        mov v3.16b, v14.16b // mov v3.8h, v14.8h
+        mov v4.16b, v15.16b // mov v4.8h, v15.8h
         fmla v3.8h, v1.8h, v12.8h
         fmla v4.8h, v2.8h, v13.8h
 
@@ -419,8 +419,8 @@ blt E1
             sxtl2 v2.8h, v10.16b
             scvtf v1.8h, v1.8h
             scvtf v2.8h, v2.8h
-            mov v3.8h, v14.8h
-            mov v4.8h, v15.8h
+            mov v3.16b, v14.16b // mov v3.8h, v14.8h
+            mov v4.16b, v15.16b // mov v4.8h, v15.8h
             fmla v3.8h, v1.8h, v12.8h
             fmla v4.8h, v2.8h, v13.8h
 
@@ -497,7 +497,7 @@ blt E1
     ld1 {v10.16b}, [x13], #16
     sxtl v1.8h, v10.8b
     scvtf v1.8h, v1.8h
-    mov v3.8h, v14.8h
+    mov v3.16b, v14.16b // mov v3.8h, v14.8h
     fmla v3.8h, v1.8h, v12.8h
 
     ld1 {v0.4h}, [x15], x11
@@ -522,7 +522,7 @@ blt E1
         ld1 {v10.16b}, [x13], #16
         sxtl v1.8h, v10.8b
         scvtf v1.8h, v1.8h
-        mov v3.8h, v14.8h
+        mov v3.16b, v14.16b // mov v3.8h, v14.8h
         fmla v3.8h, v1.8h, v12.8h
 
         ld1 {v0.4h}, [x15], x11
@@ -593,8 +593,8 @@ LoopE1:
         sxtl2 v2.8h, v10.16b
         scvtf v1.8h, v1.8h
         scvtf v2.8h, v2.8h
-        mov v3.8h, v14.8h
-        mov v4.8h, v15.8h
+        mov v3.16b, v14.16b // mov v3.8h, v14.8h
+        mov v4.16b, v15.16b // mov v4.8h, v15.8h
         fmla v3.8h, v1.8h, v12.8h
         fmla v4.8h, v2.8h, v13.8h
 
@@ -620,8 +620,8 @@ LoopE1:
             sxtl2 v2.8h, v10.16b
             scvtf v1.8h, v1.8h
             scvtf v2.8h, v2.8h
-            mov v3.8h, v14.8h
-            mov v4.8h, v15.8h
+            mov v3.16b, v14.16b // mov v3.8h, v14.8h
+            mov v4.16b, v15.16b // mov v4.8h, v15.8h
             fmla v3.8h, v1.8h, v12.8h
             fmla v4.8h, v2.8h, v13.8h
 
@@ -669,7 +669,7 @@ LoopE1:
     ld1 {v10.16b}, [x13], #16
     sxtl v1.8h, v10.8b
     scvtf v1.8h, v1.8h
-    mov v3.8h, v14.8h
+    mov v3.16b, v14.16b // mov v3.8h, v14.8h
     fmla v3.8h, v1.8h, v12.8h
 
     ld1 {v0.h}[0], [x15], x11
@@ -687,7 +687,7 @@ LoopE1:
         ld1 {v10.16b}, [x13], #16
         sxtl v1.8h, v10.8b
         scvtf v1.8h, v1.8h
-        mov v3.8h, v14.8h
+        mov v3.16b, v14.16b // mov v3.8h, v14.8h
         fmla v3.8h, v1.8h, v12.8h
         ld1 {v0.h}[0], [x15], x11
         fmla v16.8h, v3.8h, v0.h[0]
diff --git a/source/backend/coreml/backend/CoreMLExecutor.mm b/source/backend/coreml/backend/CoreMLExecutor.mm
index 2e9d99aac..7b664a0cd 100644
--- a/source/backend/coreml/backend/CoreMLExecutor.mm
+++ b/source/backend/coreml/backend/CoreMLExecutor.mm
@@ -106,31 +106,20 @@ - (MLFeatureValue*)featureValueForName:(NSString*)featureName {
     for (auto& input : *_inputs) {
         if ([featureName cStringUsingEncoding:NSUTF8StringEncoding] == input.second) {
             auto input_shape = input.first->shape();
-            NSArray* shape = @[
-                @(input_shape[0]),
-                @(input_shape[1]),
-                @(input_shape[2]),
-            ];
-            NSArray* strides = @[
-                @(input_shape[1] * input_shape[2]),
-                @(input_shape[2]),
-                @1,
-            ];
-
-            if ([self coreMlVersion] >= 3) {
-                shape = @[
-                    @(input_shape[0]),
-                    @(input_shape[1]),
-                    @(input_shape[2]),
-                    @(input_shape[3]),
-                ];
-                strides = @[
-                    @(input_shape[1] * input_shape[2] * input_shape[3]),
-                    @(input_shape[2] * input_shape[3]),
-                    @(input_shape[3]),
-                    @1,
-                ];
-            };
+            NSMutableArray* shape = [NSMutableArray arrayWithCapacity:input_shape.size()];
+            NSMutableArray* strides = [NSMutableArray arrayWithCapacity:input_shape.size()];
+            std::vector<int> stridesDim(input_shape.size());
+            int curStride = 1;
+            if (input_shape.size() >= 1) {
+                for (int i=input_shape.size()-1; i>=0; --i) {
+                    stridesDim[i] = curStride;
+                    curStride *= input_shape[i];
+                }
+            }
+            for (int i=0; i<input_shape.size(); ++i) {
+                [shape addObject:@(input_shape[i])];
+                [strides addObject:@(stridesDim[i])];
+            }
             auto tensor = input.first;
             if (tensor->getType() == halide_type_of<uint8_t>()) {
                 CVPixelBufferRef pixelBuffer = NULL;
@@ -210,6 +199,7 @@ - (bool)invokeWithInputs:(const std::vector<std::pair<const MNN::Tensor*, std::s
                 const_cast<MNN::Tensor*>(output.first)->buffer().host = (unsigned char*)data.dataPointer;
             }
         }
+        inputFeature = nil;
     }
     return YES;
 }
diff --git a/source/backend/coreml/execution/CoreMLConvolution.cpp b/source/backend/coreml/execution/CoreMLConvolution.cpp
index 2d335af36..7e1a22fb6 100644
--- a/source/backend/coreml/execution/CoreMLConvolution.cpp
+++ b/source/backend/coreml/execution/CoreMLConvolution.cpp
@@ -29,7 +29,7 @@ void CoreMLConvolution::loadWeightBias(const std::vector<Tensor *> &inputs) {
     }
     auto conv2D = mOp->main_as_Convolution2D();
     if (nullptr != conv2D->quanParameter()) {
-        quanCommon = ConvolutionCommon::load(conv2D, backend(), true);
+        quanCommon = ConvolutionCommon::load(mOp, backend(), true);
         if (nullptr == quanCommon) {
             MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", mOp->name()->c_str());
         }
@@ -84,7 +84,7 @@ void CoreMLConvolution::addPadLayer(const Tensor * input, const Convolution2DCom
         bottom = (pad_out_height - inputHeight) - top;
         left = (pad_out_width - inputWidth) / 2;
         right = (pad_out_width - inputWidth) - left;
-        
+
         if (top < 0 || bottom < 0 || left < 0 || right < 0) {
             isSamePadding = true;
             pad_out_width = outputWidth / sx;
diff --git a/source/backend/cpu/CPUAttention.cpp b/source/backend/cpu/CPUAttention.cpp
index a420f2d0d..8a5a89ec3 100644
--- a/source/backend/cpu/CPUAttention.cpp
+++ b/source/backend/cpu/CPUAttention.cpp
@@ -27,87 +27,8 @@
 // reduce the value of 'query' to 'query * FP16_QSCALE', avoid fp16 overflow
 #define FP16_QSCALE 0.5
 
-#define FP8_E5M2
-
 namespace MNN {
 
-#if defined FP8_E5M2  // E5M2 : [S E E E E E M M]
-typedef uint8_t fp8_t;
-static inline fp8_t fp16_to_fp8(FLOAT16_T x) {
-    return *((fp8_t *)(&x) + 1);
-}
-static FLOAT16_T fp8_to_fp16(fp8_t x) {
-    uint16_t rawData = 0;
-    rawData |= (uint16_t)x << 8;
-    return *((FLOAT16_T *)(&rawData));
-}
-static inline fp8_t float_to_fp8(float x) {
-    uint32_t rawData = *((uint32_t *)(&x));
-    int sign = (rawData >> 31) & 1U;
-    int exp = (int)((rawData >> 23) & 0x0ffU) - 127;
-    if (exp < -16)
-        exp = -16;
-    if (exp > 15)
-        exp = 15;
-    exp += 16;    // exp [-16, 15] ==> [0, 31]
-    int mant = (rawData >> 21) & 3U;
-    return (sign << 7) | (exp << 2) | mant;
-}
-static inline float fp8_to_float(fp8_t x) {
-    uint32_t sign = (x >> 7) & 1U;
-    uint32_t exp = (int)((x >> 2) & 0x1fU) - 16 + 127;
-    uint32_t mant = (x & 3U) << 21;
-    uint32_t rawData = (sign << 31) | (exp << 23) | mant;
-    return *((float *)(&rawData));
-}
-#elif defined FP8_E4M3  // E4M3: [S E E E E M M M]
-typedef uint8_t fp8_t;
-static inline fp8_t fp16_to_fp8(FLOAT16_T x) {
-    uint16_t rawData = *((uint16_t *)(&x));
-    int sign = (rawData >> 15) & 1U;
-    int exp  = (int)((rawData >> 10) & 0x1fU) - 15;
-    if (exp < -8)
-        exp = -8;
-    if (exp > 7)
-        exp = 7;
-    exp += 8;     // exp [-8, 7] ==> [0, 15]
-    int mant = (rawData >> 7) & 7U;
-    return (sign << 7) | (exp << 3) | mant;
-}
-static FLOAT16_T fp8_to_fp16(fp8_t x) {
-    uint32_t sign = (x >> 7) & 1U;
-    uint32_t exp = (int)((x >> 3) & 0x0fU) - 8 + 15;
-    uint32_t mant = (x & 7U) << 7;
-    uint16_t rawData = (sign << 15) | (exp << 10) | mant;
-    return *((FLOAT16_T *)(&rawData));
-}
-static inline fp8_t float_to_fp8(float x) {
-    uint32_t rawData = *((uint32_t *)(&x));
-    int sign = (rawData >> 31) & 1U;
-    int exp = (int)((rawData >> 23) & 0x0ffU) - 127;
-    if (exp < -8)
-        exp = -8;
-    if (exp > 7)
-        exp = 7;
-    exp += 8;     // exp [-8, 7] ==> [0, 15]
-    int mant = (rawData >> 20) & 7U;
-    return (sign << 7) | (exp << 3) | mant;
-}
-static inline float fp8_to_float(fp8_t x) {
-    uint32_t sign = (x >> 7) & 1U;
-    uint32_t exp = (int)((x >> 3) & 0x0fU) - 8 + 127;
-    uint32_t mant = (x & 7U) << 20;
-    uint32_t rawData = (sign << 31) | (exp<< 23) | mant;
-    return *((float *)(&rawData));
-}
-#else
-// Do not support fp8
-#endif  // fp8 format definition
-
-static int nearestInt(float x) {
-    return x < 0 ? -nearestInt(-x) : (int)(x + 0.5f);
-}
-
 template <typename T>
 static void pack_query(Tensor* query, char* pack_q, int mNumHead, int mHeadDim, int eP, int seq_len, int h, float q_scale) {
     T * query_src = query->host<T>();
@@ -121,99 +42,6 @@ static void pack_query(Tensor* query, char* pack_q, int mNumHead, int mHeadDim,
     }
 }
 
-template <typename T>
-static void pack_key(Tensor* key, char* pack_key, int mPastLength, int seq_len, int mKvNumHead, int mHeadDim, int hP, int kv_h, char* scale, char* zero_point, bool quant) {
-    if (quant) {  // Quantize the keys
-        auto key_src = key->host<T>();
-        auto key_dst = reinterpret_cast<int8_t*>(pack_key);
-        auto scale_dst = reinterpret_cast<T*>(scale);
-        auto zeroPoint_dst = reinterpret_cast<T*>(zero_point);
-        for (int i = 0; i < seq_len; i++) {
-            float minKey = key_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + 0];
-            float maxKey = key_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + 0];
-            for (int j = 1; j < mHeadDim; j++) {
-                auto key = key_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + j];
-                minKey = ALIMIN(minKey, key);
-                maxKey = ALIMAX(maxKey, key);
-            }
-            int out_index = (mPastLength + i) / hP;
-            int in_index  = (mPastLength + i) % hP;
-            scale_dst[out_index * hP + in_index] = (maxKey - minKey) / 255.0f;
-            zeroPoint_dst[out_index * hP + in_index] = 128.0f * (maxKey - minKey) / 255.0f + minKey;
-            for (int j = 0; j < mHeadDim; j++) {
-                key_dst[out_index * mHeadDim * hP + j * hP + in_index] = nearestInt((key_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + j] - minKey) / (maxKey - minKey) * 255 - 128);
-            }
-        }
-    }
-    else {  // Do not quantize the keys
-        auto key_src = key->host<T>();
-        auto key_dst = reinterpret_cast<T*>(pack_key);
-        for (int i = 0; i < seq_len; i++) {
-            int out_index = (mPastLength + i) / hP;
-            int in_index  = (mPastLength + i) % hP;
-            for (int j = 0; j < mHeadDim; j++) {
-                key_dst[out_index * mHeadDim * hP + j * hP + in_index] = key_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + j];
-            }
-        }
-    }
-}
-
-
-
-template <typename T>
-static void pack_value(Tensor* value, char* pack_value, int mMaxLength, int mPastLength, int seq_len, int mKvNumHead, int mHeadDim, int hP, int kv_h, bool quant) {
-    if (quant) {  // Quantize the values to fp8
-        T * value_src = value->host<T>();
-        fp8_t * value_dst = reinterpret_cast<fp8_t*>(pack_value);
-        for (int i = 0; i < seq_len; i++) {
-            for (int j = 0; j < mHeadDim; j++) {
-                int out_index = j / hP;
-                int in_index  = j % hP;
-                auto origin = value_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + j];
-                if (sizeof(T) == 2)
-                    value_dst[out_index * mMaxLength * hP + (mPastLength + i) * hP + in_index] = fp16_to_fp8(origin);
-                else
-                    value_dst[out_index * mMaxLength * hP + (mPastLength + i) * hP + in_index] = float_to_fp8(origin);
-            }
-        }
-    }
-    else {  // Do not quantize the values
-        T * value_src = value->host<T>();
-        T * value_dst = reinterpret_cast<T*>(pack_value);
-        for (int i = 0; i < seq_len; i++) {
-            for (int j = 0; j < mHeadDim; j++) {
-                int out_index = j / hP;
-                int in_index  = j % hP;
-                value_dst[out_index * mMaxLength * hP + (mPastLength + i) * hP + in_index] = value_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + j];
-            }
-        }
-    }
-}
-
-void dequant_value_float(char * dst, char * src, int mHeadDim, int kv_seq_len, int hP, int mMaxLength) {
-    fp8_t * qv = (fp8_t *)src;
-    float * dqv = (float *)dst;
-    for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
-        for (int j = 0; j < kv_seq_len; j++) {
-            for (int k = 0; k < hP; k++) {
-                dqv[i * kv_seq_len * hP + j * hP + k] = fp8_to_float(qv[i * mMaxLength * hP + j * hP + k]);
-            }
-        }
-    }
-}
-
-void dequant_value_fp16(char * dst, char * src, int mHeadDim, int kv_seq_len, int hP, int mMaxLength) {
-    fp8_t * qv = (fp8_t *)src;
-    FLOAT16_T * dqv = (FLOAT16_T *)dst;
-    for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
-        for (int j = 0; j < kv_seq_len; j++) {
-            for (int k = 0; k < hP; k++) {
-                dqv[i * kv_seq_len * hP + j * hP + k] = fp8_to_fp16(qv[i * mMaxLength * hP + j * hP + k]);
-            }
-        }
-    }
-}
-
 template <typename T>
 static void unpack_QK(float * unpack_qk_dst, char * pack_qk_src, int seq_len, int kv_seq_len, int unit) {
     float * dst = unpack_qk_dst;
@@ -285,95 +113,6 @@ static void unpack_QKV(char* pack_qkv, char* unpack_qkv, int mNumHead, int mHead
     }
 }
 
-void CPUAttention::allocKVCache(int kv_seq_len, bool quantKey, bool quantValue) {
-    if (!mKVCache) {
-        return;
-    }
-    mResource->mMaxLength = kv_seq_len + mResource->mExpandChunk;
-    if (quantKey) {
-        mResource->mPastKey.reset(Tensor::createDevice<int8_t>({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), mResource->mHeadDim, hP}));
-        mResource->mDequantKeyScale.reset(Tensor::createDevice<float>({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), 1, hP}));
-        mResource->mDequantKeyZeroPoint.reset(Tensor::createDevice<float>({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), 1, hP}));
-        backend()->onAcquireBuffer(mResource->mPastKey.get(), Backend::STATIC);
-        backend()->onAcquireBuffer(mResource->mDequantKeyScale.get(), Backend::STATIC);
-        backend()->onAcquireBuffer(mResource->mDequantKeyZeroPoint.get(), Backend::STATIC);
-    } else {
-        mResource->mPastKey.reset(Tensor::createDevice<float>({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), mResource->mHeadDim, hP}));
-        backend()->onAcquireBuffer(mResource->mPastKey.get(), Backend::STATIC);
-    }
-    if (quantValue) {
-        mResource->mPastValue.reset(Tensor::createDevice<fp8_t>({mResource->mKvNumHead, UP_DIV(mResource->mHeadDim, hP), mResource->mMaxLength, hP}));
-        backend()->onAcquireBuffer(mResource->mPastValue.get(), Backend::STATIC);
-    } else {
-        mResource->mPastValue.reset(Tensor::createDevice<float>({mResource->mKvNumHead, UP_DIV(mResource->mHeadDim, hP), mResource->mMaxLength, hP}));
-        backend()->onAcquireBuffer(mResource->mPastValue.get(), Backend::STATIC);
-    }
-}
-
-void CPUAttention::reallocKVCache(int kv_seq_len, bool quantKey, bool quantValue) {
-    if (!mKVCache || kv_seq_len <= mResource->mMaxLength) {
-        return;
-    }
-    int oldMaxLength = mResource->mMaxLength;
-    mResource->mMaxLength = kv_seq_len + mResource->mExpandChunk;
-    if (quantKey) {
-        auto new_key = Tensor::createDevice<int8_t>({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), mResource->mHeadDim, hP});
-        auto new_scale = Tensor::createDevice<float>({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), 1, hP});
-        auto new_zeroPoint = Tensor::createDevice<float>({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), 1, hP});
-        backend()->onAcquireBuffer(new_key, Backend::STATIC);
-        backend()->onAcquireBuffer(new_scale, Backend::STATIC);
-        backend()->onAcquireBuffer(new_zeroPoint, Backend::STATIC);
-        for (int h = 0; h < mResource->mKvNumHead; h++) {
-            memcpy(new_key->host<char>() + h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP,
-                    mResource->mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mResource->mHeadDim * hP,
-                    UP_DIV(oldMaxLength, hP) * mResource->mHeadDim * hP);
-            memcpy(new_scale->host<char>() + h * UP_DIV(mResource->mMaxLength, hP) * hP * bytes,
-                    mResource->mDequantKeyScale->host<char>() + h * UP_DIV(oldMaxLength, hP) * hP * bytes,
-                    UP_DIV(oldMaxLength, hP) * hP * bytes);
-            memcpy(new_zeroPoint->host<char>() + h * UP_DIV(mResource->mMaxLength, hP) * hP * bytes,
-                    mResource->mDequantKeyZeroPoint->host<char>() + h * UP_DIV(oldMaxLength, hP) * hP * bytes,
-                    UP_DIV(oldMaxLength, hP) * hP * bytes);
-        }
-        mResource->mPastKey.reset(new_key);
-        mResource->mDequantKeyScale.reset(new_scale);
-        mResource->mDequantKeyZeroPoint.reset(new_zeroPoint);
-    }
-    else {
-        auto new_key = Tensor::createDevice<float>({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), mResource->mHeadDim, hP});
-        backend()->onAcquireBuffer(new_key, Backend::STATIC);
-        for (int h = 0; h < mResource->mKvNumHead; h++) {
-            memcpy(new_key->host<char>() + h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * bytes,
-                    mResource->mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mResource->mHeadDim * hP * bytes,
-                    UP_DIV(oldMaxLength, hP) * mResource->mHeadDim * hP * bytes);
-        }
-        mResource->mPastKey.reset(new_key);
-    }
-    if (quantValue) {
-        auto new_value = Tensor::createDevice<fp8_t>({mResource->mKvNumHead, UP_DIV(mResource->mHeadDim, hP), mResource->mMaxLength, hP});
-        backend()->onAcquireBuffer(new_value, Backend::STATIC);
-        for (int h = 0; h < mResource->mKvNumHead; h++) {
-            for (int i = 0; i < UP_DIV(mResource->mHeadDim, hP); i++) {
-                memcpy(new_value->host<char>() + (h * UP_DIV(mResource->mHeadDim, hP) + i) * mResource->mMaxLength * hP,
-                        mResource->mPastValue->host<char>() + (h * UP_DIV(mResource->mHeadDim, hP) + i) * oldMaxLength * hP,
-                        oldMaxLength * hP);
-            }
-        }
-        mResource->mPastValue.reset(new_value);
-    }
-    else {
-        auto new_value = Tensor::createDevice<float>({mResource->mKvNumHead, UP_DIV(mResource->mHeadDim, hP), mResource->mMaxLength, hP});
-        backend()->onAcquireBuffer(new_value, Backend::STATIC);
-        for (int h = 0; h < mResource->mKvNumHead; h++) {
-            for (int i = 0; i < UP_DIV(mResource->mHeadDim, hP); i++) {
-                memcpy(new_value->host<char>() + (h * UP_DIV(mResource->mHeadDim, hP) + i) * mResource->mMaxLength * hP * bytes,
-                        mResource->mPastValue->host<char>() + (h * UP_DIV(mResource->mHeadDim, hP) + i) * oldMaxLength * hP * bytes,
-                        oldMaxLength * hP * bytes);
-            }
-        }
-        mResource->mPastValue.reset(new_value);
-    }
-}
-
 ErrorCode CPUAttention::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
     auto core = static_cast<CPUBackend *>(backend())->functions();
     core->MNNGetMatMulPackMode(&eP, &lP, &hP);
@@ -383,11 +122,12 @@ ErrorCode CPUAttention::onResize(const std::vector<Tensor*>& inputs, const std::
     auto query = inputs[0];
     auto key   = inputs[1];
     int seq_len = query->shape()[1];
-    mResource->mNumHead = query->shape()[2];
-    mResource->mHeadDim = query->shape()[3];
-    mResource->mKvNumHead = key->shape()[2];
-    mPackQ.reset(Tensor::createDevice<float>({mThreadNum, UP_DIV(seq_len, eP), mResource->mHeadDim, eP}));
-    mPackQKV.reset(Tensor::createDevice<float>({mThreadNum, UP_DIV(mResource->mHeadDim, unit), seq_len, unit}));
+    mNumHead = query->shape()[2];
+    mHeadDim = query->shape()[3];
+    mKvNumHead = key->shape()[2];
+    mKVCacheManager->onResize(mKvNumHead, mHeadDim);
+    mPackQ.reset(Tensor::createDevice<float>({mThreadNum, UP_DIV(seq_len, eP), mHeadDim, eP}));
+    mPackQKV.reset(Tensor::createDevice<float>({mThreadNum, UP_DIV(mHeadDim, unit), seq_len, unit}));
     backend()->onAcquireBuffer(mPackQ.get(), Backend::DYNAMIC);
     backend()->onAcquireBuffer(mPackQKV.get(), Backend::DYNAMIC);
     backend()->onReleaseBuffer(mPackQ.get(), Backend::DYNAMIC);
@@ -396,7 +136,7 @@ ErrorCode CPUAttention::onResize(const std::vector<Tensor*>& inputs, const std::
 }
 
 ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
-    auto core = static_cast<CPUBackend *>(backend())->functions();
+    auto core  = static_cast<CPUBackend *>(backend())->functions();
     auto query = inputs[0];
     auto key   = inputs[1];
     auto value = inputs[2];
@@ -410,19 +150,10 @@ ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std:
     mIsPrefill = (seq_len > 1);
     // isPrefill and mask is Square Matrix, is FirstPrefill
     mIsFirstPrefill = mIsPrefill && (mask_kvlen == mask_seqlen);
-    int tileCount = UP_DIV(mResource->mNumHead, mThreadNum);
-    int group_size = mResource->mNumHead / mResource->mKvNumHead;
-
-    // 0: do not quant kv
-    // 1: only quant k
-    // 2: only quant v
-    // 3: quant kv
-    int quantKV = static_cast<CPUBackend *>(backend())->getRuntime()->hint().kvcacheQuantOption;
-    bool quantKey = (quantKV & 1) == 1;
-    bool quantValue = ((quantKV >> 1) & 1) == 1;
-
+    int tileCount = UP_DIV(mNumHead, mThreadNum);
+    int group_size = mNumHead / mKvNumHead;
     // reduce the value of 'query' to avoid fp16 overflow
-    float mScale = 1.0 / sqrt(mResource->mHeadDim);
+    float mScale = 1.0 / sqrt(mHeadDim);
     float q_scale = 1.0;
     if (bytes == 2) {
         q_scale = FP16_QSCALE;
@@ -430,133 +161,70 @@ ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std:
     }
 
     if (mIsPrefill) {
-        // Only reset the kvcache in the first prefill, but keep the kvcache in subsequent prefill
         if (mIsFirstPrefill) {
-            mResource->mPastLength = 0;
-            allocKVCache(seq_len, quantKey, quantValue);
+            mKVCacheManager->onClear();
+            mKVCacheManager->onAlloc(seq_len);
         } else {
-            reallocKVCache(mResource->mPastLength + seq_len, quantKey, quantValue);
+            mKVCacheManager->onRealloc(mKVCacheManager->kvLength() + seq_len);
         }
     } else { // Decode
-        reallocKVCache(mResource->mPastLength + 1, quantKey, quantValue);
+        mKVCacheManager->onRealloc(mKVCacheManager->kvLength() + 1);
     }
-    int kv_seq_len  = mResource->mPastLength + seq_len;
-
+    // Add the new kv to the kvcache
+    mKVCacheManager->onPushBack(key, value);
+    int kv_seq_len  = mKVCacheManager->kvLength();
+    int max_len = mKVCacheManager->maxLength();
+    bool quant_key = mKVCacheManager->config()->mQuantKey;
+    bool quant_value = mKVCacheManager->config()->mQuantValue;
     // Temporary tensors for intermediate results
     std::shared_ptr<Tensor> packQK(Tensor::createDevice<float>({mThreadNum, UP_DIV(kv_seq_len, unit), seq_len, unit}));
     std::shared_ptr<Tensor> unpackQK(Tensor::createDevice<int32_t>({mThreadNum, seq_len, kv_seq_len}));
     std::shared_ptr<Tensor> softmaxQK(Tensor::createDevice<int>({mThreadNum, seq_len, kv_seq_len}));
     std::shared_ptr<Tensor> newPackQK(Tensor::createDevice<float>({mThreadNum, UP_DIV(seq_len, eP), kv_seq_len, eP}));
-    std::shared_ptr<Tensor> dequantV(Tensor::createDevice<float>({mThreadNum, UP_DIV(mResource->mHeadDim, hP), kv_seq_len, hP}));
+    std::shared_ptr<Tensor> dequantV(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mHeadDim, hP), kv_seq_len, hP}));
     backend()->onAcquireBuffer(packQK.get(), Backend::STATIC);
     backend()->onAcquireBuffer(unpackQK.get(), Backend::STATIC);
     backend()->onAcquireBuffer(softmaxQK.get(), Backend::STATIC);
     backend()->onAcquireBuffer(newPackQK.get(), Backend::STATIC);
-    if (quantValue) {
+    if (quant_value) {
         backend()->onAcquireBuffer(dequantV.get(), Backend::STATIC);
+        mKVCacheManager->onDequantValue(dequantV.get());
     }
 
     std::function<void(int)> mCompute = [=](int tId) {
-        auto pack_q      = mPackQ->host<char>() + tId * UP_DIV(seq_len, eP) * mResource->mHeadDim * eP * bytes;
+        auto pack_q      = mPackQ->host<char>() + tId * UP_DIV(seq_len, eP) * mHeadDim * eP * bytes;
         auto pack_qk     = packQK->host<char>() + tId * UP_DIV(kv_seq_len, unit) * seq_len * unit * bytes;
         auto unpack_qk   = unpackQK->host<float>() + tId * seq_len * kv_seq_len;
-        auto softmax_qk   = softmaxQK->host<float>() + tId * seq_len * kv_seq_len;
+        auto softmax_qk  = softmaxQK->host<float>() + tId * seq_len * kv_seq_len;
         auto new_pack_qk = newPackQK->host<char>() + tId * UP_DIV(seq_len, eP) * kv_seq_len * eP * bytes;
-        auto pack_qkv    = mPackQKV->host<char>() + tId * UP_DIV(mResource->mHeadDim, unit) * seq_len * unit * bytes;
-        int head_index   = tId * tileCount;
-        for (int h = head_index; h < head_index + tileCount && h < mResource->mNumHead; h++) {
-            int    kv_h                 = h / group_size;
-            char * key_dst              = nullptr;
-            char * key_scale_dst        = nullptr;
-            char * key_zero_point_dst   = nullptr;
-            char * value_dst            = nullptr;
-            if (quantKey) {
-                key_dst = mResource->mPastKey->host<char>() + kv_h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP;
-                key_scale_dst = mResource->mDequantKeyScale->host<char>() + kv_h * UP_DIV(mResource->mMaxLength, hP) * 1 * hP * bytes;
-                key_zero_point_dst = mResource->mDequantKeyZeroPoint->host<char>() + kv_h * UP_DIV(mResource->mMaxLength, hP) * 1 * hP * bytes;
-            } else {
-                key_dst   = mResource->mPastKey->host<char>() + kv_h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * bytes;
-            }
-            if (quantValue) {
-                value_dst = mResource->mPastValue->host<char>() + kv_h * UP_DIV(mResource->mHeadDim, hP) * mResource->mMaxLength * hP;
-            } else {
-                value_dst = mResource->mPastValue->host<char>() + kv_h * UP_DIV(mResource->mHeadDim, hP) * mResource->mMaxLength * hP * bytes;
-            }
-            // pack for matmul
+        auto pack_qkv    = mPackQKV->host<char>() + tId * UP_DIV(mHeadDim, unit) * seq_len * unit * bytes;
+        auto QxK         = quant_key ? core->MNNPackedMatMul_int8 : core->MNNPackedMatMul;
+        auto QxK_remain  = quant_key ? core->MNNPackedMatMulRemain_int8 : core->MNNPackedMatMulRemain;
+        int  head_index  = tId * tileCount;
+        for (int h = head_index; h < head_index + tileCount && h < mNumHead; h++) {
+            int    kv_h            = h / group_size;
+            char * key_addr        = mKVCacheManager->addrOfKey(kv_h);
+            char * scale_addr      = quant_key ? mKVCacheManager->addrOfScale(kv_h) : nullptr;
+            char * zero_point_addr = quant_key ? mKVCacheManager->addrOfZeroPoint(kv_h) : nullptr;
+            char * value_addr      = quant_value ? dequantV->host<char>() + kv_h * UP_DIV(mHeadDim, hP) * kv_seq_len * hP * bytes : mKVCacheManager->addrOfValue(kv_h);
             if (bytes == 2) {
-                pack_query<FLOAT16_T>(query, pack_q, mResource->mNumHead, mResource->mHeadDim, eP, seq_len, h, q_scale);
-                pack_key<FLOAT16_T>(key, key_dst, mResource->mPastLength, seq_len, mResource->mKvNumHead, mResource->mHeadDim, hP, kv_h, key_scale_dst, key_zero_point_dst, quantKey);
-                pack_value<FLOAT16_T>(value, value_dst, mResource->mMaxLength, mResource->mPastLength, seq_len, mResource->mKvNumHead, mResource->mHeadDim, hP, kv_h, quantValue);
+                pack_query<FLOAT16_T>(query, pack_q, mNumHead, mHeadDim, eP, seq_len, h, q_scale);
             } else {
-                pack_query<float>(query, pack_q, mResource->mNumHead, mResource->mHeadDim, eP, seq_len, h, q_scale);
-                pack_key<float>(key, key_dst, mResource->mPastLength, seq_len, mResource->mKvNumHead, mResource->mHeadDim, hP, kv_h, key_scale_dst, key_zero_point_dst, quantKey);
-                pack_value<float>(value, value_dst, mResource->mMaxLength, mResource->mPastLength, seq_len, mResource->mKvNumHead, mResource->mHeadDim, hP, kv_h, quantValue);
+                pack_query<float>(query, pack_q, mNumHead, mHeadDim, eP, seq_len, h, q_scale);
             }
             // query @ key
             int loop_e = seq_len / eP;
             int remain = seq_len % eP;
+            size_t shapeParameters[7] = {(size_t)eP * bytes, (size_t)mHeadDim, (size_t)kv_seq_len, (size_t)seq_len * unit * bytes, 0, 0, 0};
             for (int i = 0 ; i < loop_e; i++) {
-                size_t shapeParameters[7];
-                size_t* parameters = shapeParameters;
-                parameters[0] = eP * bytes;
-                parameters[1] = mResource->mHeadDim;
-                parameters[2] = kv_seq_len;
-                parameters[3] = seq_len * unit * bytes;
-                parameters[4] = 0;
-                parameters[5] = 0;
-                parameters[6] = 0;
-                if (quantKey) {
-                    core->MNNPackedMatMul_int8(
-                        (float*)(pack_qk + (i * eP * unit) * bytes),
-                        (float*)(pack_q + (i * mResource->mHeadDim * eP) * bytes),
-                        (float*)key_dst,
-                        parameters, nullptr, nullptr,
-                        (float*)key_scale_dst, (float*)key_zero_point_dst
-                    );
-                } else {
-                    core->MNNPackedMatMul(
-                        (float*)(pack_qk + (i * eP * unit) * bytes),
-                        (float*)(pack_q + (i * mResource->mHeadDim * eP) * bytes),
-                        (float*)key_dst,
-                        parameters, nullptr, nullptr,
-                        nullptr, nullptr
-                    );
-                }
-            }
-            {
-                size_t shapeParameters[7];
-                size_t* parameters = shapeParameters;
-                parameters[0] = eP * bytes;
-                parameters[1] = mResource->mHeadDim;
-                parameters[2] = kv_seq_len;
-                parameters[3] = seq_len * unit * bytes;
-                parameters[4] = 0;
-                parameters[5] = 0;
-                parameters[6] = 0;
-                if (quantKey) {
-                    core->MNNPackedMatMulRemain_int8(
-                        (float*)(pack_qk + (loop_e * eP * unit) * bytes),
-                        (float*)(pack_q + (loop_e * mResource->mHeadDim * eP) * bytes),
-                        (float*)key_dst,
-                        remain, parameters, nullptr, nullptr,
-                        (float*)key_scale_dst, (float*)key_zero_point_dst
-                    );
-                } else {
-                    core->MNNPackedMatMulRemain(
-                        (float*)(pack_qk + (loop_e * eP * unit) * bytes),
-                        (float*)(pack_q + (loop_e * mResource->mHeadDim * eP) * bytes),
-                        (float*)key_dst,
-                        remain, parameters, nullptr, nullptr,
-                        nullptr, nullptr
-                    );
-                }
+                QxK((float*)(pack_qk + (i * eP * unit) * bytes), (float*)(pack_q + (i * mHeadDim * eP) * bytes), (float*)key_addr, shapeParameters, nullptr, nullptr, (float*)scale_addr, (float*)zero_point_addr);
             }
+            QxK_remain((float*)(pack_qk + (loop_e * eP * unit) * bytes), (float*)(pack_q + (loop_e * mHeadDim * eP) * bytes), (float*)key_addr, remain, shapeParameters, nullptr, nullptr, (float*)scale_addr, (float*)zero_point_addr);
+            // qk: [kv_seq_len/unit, seq_len, unit] -> [seq_len, kv_seq_len] -> [seq_len/eP, kv_seq_len, eP]
             if(bytes == 2) {
-                // unpack qk: [kv_seq_len/unit, seq_len, unit] -> [seq_len, kv_seq_len]
                 unpack_QK<FLOAT16_T>(unpack_qk, pack_qk, seq_len, kv_seq_len, unit);
                 mask_QK<FLOAT16_T>(unpack_qk, seq_len, kv_seq_len, mScale, std::numeric_limits<float>::lowest(), mask->host<int>(), float_mask);
                 softmax_QK(softmax_qk, unpack_qk, seq_len, kv_seq_len);
-                // pack qk for qk @ v : [seq_len, kv_seq_len] -> [seq_len/eP, kv_seq_len, eP]
                 pack_QK<FLOAT16_T>(new_pack_qk, softmax_qk, seq_len, kv_seq_len, eP);
             } else {
                 unpack_QK<float>(unpack_qk, pack_qk, seq_len, kv_seq_len, unit);
@@ -564,56 +232,20 @@ ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std:
                 softmax_QK(softmax_qk, unpack_qk, seq_len, kv_seq_len);
                 pack_QK<float>(new_pack_qk, softmax_qk, seq_len, kv_seq_len, eP);
             }
-            // Dequantize values from fp8 to float
-            if (quantValue) {
-                char * qv = value_dst;
-                char * dqv = dequantV->host<char>() + tId * UP_DIV(mResource->mHeadDim, hP) * kv_seq_len * hP * bytes;
-                if (bytes == 2) {
-                    dequant_value_fp16(dqv, qv, mResource->mHeadDim, kv_seq_len, hP, mResource->mMaxLength);
-                } else {
-                    dequant_value_float(dqv, qv, mResource->mHeadDim, kv_seq_len, hP, mResource->mMaxLength);
-                }
-                value_dst = dqv;
-            }
             // qk @ v
+            shapeParameters[1] = kv_seq_len;
+            shapeParameters[2] = mHeadDim;
+            shapeParameters[5] = quant_value ? 0 : (max_len - kv_seq_len) * hP * bytes;
             for (int i = 0 ; i < loop_e; i++) {
-                size_t shapeParameters[6];
-                size_t* parameters = shapeParameters;
-                parameters[0]          = eP * bytes;
-                parameters[1]          = kv_seq_len;
-                parameters[2]          = mResource->mHeadDim;
-                parameters[3]          = seq_len * unit * bytes;
-                parameters[4]          = 0;
-                parameters[5]          = quantValue ? 0 : (mResource->mMaxLength - kv_seq_len) * hP * bytes;
-                core->MNNPackedMatMul(
-                    (float*)(pack_qkv + (i * eP * unit) * bytes),
-                    (float*)(new_pack_qk + (i * kv_seq_len * eP) * bytes),
-                    (float*)value_dst, parameters,
-                    nullptr, nullptr, nullptr, nullptr
-                );
-            }
-            {
-                size_t shapeParameters[6];
-                size_t* parameters = shapeParameters;
-                parameters[0]          = eP * bytes;
-                parameters[1]          = kv_seq_len;
-                parameters[2]          = mResource->mHeadDim;
-                parameters[3]          = seq_len * unit * bytes;
-                parameters[4]          = 0;
-                parameters[5]          = quantValue ? 0 : (mResource->mMaxLength - kv_seq_len) * hP * bytes;
-                core->MNNPackedMatMulRemain(
-                    (float*)(pack_qkv + (loop_e * eP * unit) * bytes),
-                    (float*)(new_pack_qk + (loop_e * kv_seq_len * eP) * bytes),
-                    (float*)value_dst, remain, parameters,
-                    nullptr, nullptr, nullptr, nullptr
-                );
+                core->MNNPackedMatMul((float*)(pack_qkv + (i * eP * unit) * bytes), (float*)(new_pack_qk + (i * kv_seq_len * eP) * bytes), (float*)value_addr, shapeParameters, nullptr, nullptr, nullptr, nullptr);
             }
+            core->MNNPackedMatMulRemain((float*)(pack_qkv + (loop_e * eP * unit) * bytes), (float*)(new_pack_qk + (loop_e * kv_seq_len * eP) * bytes), (float*)value_addr, remain, shapeParameters, nullptr, nullptr, nullptr, nullptr);
             // unpack: [head_dim/unit, seq_len, unit] -> [seq_len, num_head, head_dim]
-            auto dst_ptr = outputs[0]->host<char>() + h * mResource->mHeadDim * bytes;
+            auto dst_ptr = outputs[0]->host<char>() + h * mHeadDim * bytes;
             if (bytes == 2) {
-                unpack_QKV<int16_t>(pack_qkv, dst_ptr, mResource->mNumHead, mResource->mHeadDim, unit, seq_len);
+                unpack_QKV<int16_t>(pack_qkv, dst_ptr, mNumHead, mHeadDim, unit, seq_len);
             } else {
-                unpack_QKV<float>(pack_qkv, dst_ptr, mResource->mNumHead, mResource->mHeadDim, unit, seq_len);
+                unpack_QKV<float>(pack_qkv, dst_ptr, mNumHead, mHeadDim, unit, seq_len);
             }
         }
     };
@@ -623,12 +255,11 @@ ErrorCode CPUAttention::onExecute(const std::vector<Tensor*>& inputs, const std:
     }
     MNN_CONCURRENCY_END();
 
-    mResource->mPastLength += seq_len;
     backend()->onReleaseBuffer(packQK.get(), Backend::STATIC);
     backend()->onReleaseBuffer(unpackQK.get(), Backend::STATIC);
     backend()->onReleaseBuffer(softmaxQK.get(), Backend::STATIC);
     backend()->onReleaseBuffer(newPackQK.get(), Backend::STATIC);
-    if (quantValue){
+    if (quant_value){
         backend()->onReleaseBuffer(dequantV.get(), Backend::STATIC);
     }
     return NO_ERROR;
@@ -639,14 +270,26 @@ bool CPUAttention::onClone(Backend* bn, const Op* op, Execution** dst) {
         return true;
     }
     auto tmp = new CPUAttention(bn, mKVCache);
-    tmp->mResource = mResource;
+    tmp->mKVCacheManager = mKVCacheManager;
     *dst = tmp;
     return true;
 }
 
-CPUAttention::CPUAttention(Backend *backend, bool kv_cache) : Execution(backend) {
-    mKVCache = kv_cache;
-    mResource.reset(new Resource);
+CPUAttention::CPUAttention(Backend *backend, bool kv_cache) : Execution(backend), mKVCache(kv_cache) {
+    if (mKVCache) {
+        MNN::KVCacheManager::KVCacheConfig kvconfig;
+        int kvcacheQuantOptions = static_cast<CPUBackend *>(backend)->getRuntime()->hint().kvcacheQuantOption;
+        kvconfig.mQuantKey   = (kvcacheQuantOptions & 1);
+        kvconfig.mQuantValue = ((kvcacheQuantOptions >> 1) & 1);
+        kvconfig.mKVCacheDir = static_cast<CPUBackend *>(backend)->getRuntime()->hint().kvcacheDirPath;
+        kvconfig.mKVCacheSizeLimit = static_cast<CPUBackend *>(backend)->getRuntime()->hint().kvcacheSizeLimit;
+        kvconfig.mExpandChunk = 64;
+        mKVCacheManager.reset(new KVCacheManager(backend, kvconfig));
+    }
+}
+
+CPUAttention::~CPUAttention() {
+    
 }
 
 class CPUAttentionCreator : public CPUBackend::Creator {
@@ -662,4 +305,4 @@ REGISTER_CPU_OP_CREATOR_TRANSFORMER(CPUAttentionCreator, OpType_Attention);
 
 } // namespace MNN
 
-#endif
\ No newline at end of file
+#endif // MNN_SUPPORT_TRANSFORMER_FUSE
\ No newline at end of file
diff --git a/source/backend/cpu/CPUAttention.hpp b/source/backend/cpu/CPUAttention.hpp
index abf351249..4aba816f3 100644
--- a/source/backend/cpu/CPUAttention.hpp
+++ b/source/backend/cpu/CPUAttention.hpp
@@ -13,38 +13,32 @@
 
 #include <functional>
 #include "core/Execution.hpp"
+#include "MNN/ErrorCode.hpp"
+#include "KVCacheManager.hpp"
 
 namespace MNN {
 
-
 class CPUAttention : public Execution {
 public:
     CPUAttention(Backend *backend, bool kv_cache);
-    virtual ~CPUAttention() = default;
+    virtual ~CPUAttention();
     virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
     virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
     virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
-    struct Resource {
-        std::shared_ptr<Tensor> mPastKey;               // numhead, [maxlen/eP, headdim, eP]
-        std::shared_ptr<Tensor> mPastValue;             // numhead, [headdim/eP, maxlen, eP]
-        std::shared_ptr<Tensor> mDequantKeyScale;       // numhead, [maxlen/eP, 1, eP]
-        std::shared_ptr<Tensor> mDequantKeyZeroPoint;   // numhead, [maxlen/eP, 1, eP]
-        int mPastLength = 0, mMaxLength = 0;
-        const int mExpandChunk = 64;
-        int mNumHead = 0, mKvNumHead = 0, mHeadDim = 0;
-    };
 private:
-    void allocKVCache(int kv_seq_len, bool quantK, bool quantV);
-    void reallocKVCache(int kv_seq_len, bool quantK, bool quantV);
-    bool mIsPrefill = true;
+    bool mIsPrefill      = true;
     bool mIsFirstPrefill = true;
-    bool mKVCache;
-    int mThreadNum = 1;
-    std::shared_ptr<Resource> mResource;
+    bool mKVCache        = true;
+    int bytes = 4;
+    int mThreadNum = 1;;
+    int eP, lP, hP, unit;
+    int mNumHead, mKvNumHead, mHeadDim;
     std::shared_ptr<Tensor> mPackQ, mPackQKV;
-    int eP, lP, hP, bytes, unit;
+    std::shared_ptr<KVCacheManager> mKVCacheManager = nullptr;
 };
+
 } // namespace MNN
 
 #endif // CPUATTENTION_HPP
-#endif
+
+#endif // MNN_SUPPORT_TRANSFORMER_FUSE
diff --git a/source/backend/cpu/CPUBackend.cpp b/source/backend/cpu/CPUBackend.cpp
index 5f1a75eab..99156a447 100644
--- a/source/backend/cpu/CPUBackend.cpp
+++ b/source/backend/cpu/CPUBackend.cpp
@@ -192,12 +192,14 @@ void CPURuntime::_resetThreadPool() {
     // Reset tid to rebind cpu if necessary
     mCurrentTID = 0;
 }
-void CPURuntime::onReset(int numberThread, const BackendConfig* config) {
+void CPURuntime::onReset(int numberThread, const BackendConfig* config, bool full) {
     if (config != nullptr) {
-        mPrecision = config->precision;
         mPower = config->power;
-        mMemory = config->memory;
-        mFlags = config->flags;
+        if (full) {
+            mPrecision = config->precision;
+            mMemory = config->memory;
+            mFlags = config->flags;
+        }
     }
     mThreadNumber = numberThread;
     _resetThreadPool();
diff --git a/source/backend/cpu/CPUBackend.hpp b/source/backend/cpu/CPUBackend.hpp
index 1ac8721de..1286df907 100644
--- a/source/backend/cpu/CPUBackend.hpp
+++ b/source/backend/cpu/CPUBackend.hpp
@@ -25,7 +25,7 @@ class CPURuntime : public Runtime {
     virtual ~ CPURuntime();
     int onGetRuntimeStatus(RuntimeStatus statusEnum) const override;
     virtual Backend* onCreate(const BackendConfig* config) const override;
-    virtual void onReset(int numberThread, const BackendConfig* config) override;
+    virtual void onReset(int numberThread, const BackendConfig* config, bool full) override;
     virtual void onGabageCollect(int level) override;
     virtual float onGetMemoryInMB() override;
     virtual CompilerType onGetCompilerType() const override {
diff --git a/source/backend/cpu/CPUConvolution.cpp b/source/backend/cpu/CPUConvolution.cpp
index 9c42008d9..109b4cc6a 100644
--- a/source/backend/cpu/CPUConvolution.cpp
+++ b/source/backend/cpu/CPUConvolution.cpp
@@ -85,7 +85,7 @@ CPUConvolution::MutableResourceInt8::MutableResourceInt8(std::shared_ptr<Resourc
     if (mValid) {
         mValid = backend->onAcquireBuffer(mScaleFloat.get(), Backend::STATIC);
     }
-    
+
 }
 
 void CPUConvolution::MutableResourceInt8::updateInputOutputScale(std::vector<float> inputQuantInfo, std::vector<float> outputQuantInfo) {
@@ -116,7 +116,7 @@ void CPUConvolution::MutableResourceInt8::updateInputOutputScale(std::vector<flo
     if (mInputScale == 0 || mOutputScale == 0) {
         return;
     }
-    
+
     int size = mResource->mOutputCount;
     const int kernelNum = static_cast<int>(mResource->mInt8WeightKernelSum.size());
     auto biasData    = mResource->mOriginBias->host<float>();
@@ -143,22 +143,27 @@ void CPUConvolution::MutableResourceInt8::updateInputOutputScale(std::vector<flo
         biasfloat[i] = bias[i] * scale[i];
     }
 }
-std::shared_ptr<CPUConvolution::ResourceInt8> CPUConvolution::makeResourceInt8(Backend* backend, const MNN::Convolution2D *convParam, int pack) {
+std::shared_ptr<CPUConvolution::ResourceInt8> CPUConvolution::makeResourceInt8(Backend* backend, const MNN::Op* op, int pack) {
+    auto convParam = op->main_as_Convolution2D();
     auto core = static_cast<CPUBackend*>(backend)->functions();
     // TODO: use different pack from float
     int UNIT = pack;
-    
+
     std::shared_ptr<CPUConvolution::ResourceInt8> resource(new ResourceInt8);
     // TODO: ConvInt8Winograd need in/out scale, which isn't exist in quantinfo when model construct by V3 API
     const auto convCommon  = convParam->common();
     const auto group = convParam->common()->group();
     const auto outputCount = convCommon->outputCount();
     const auto outputChannleUp4 = UP_DIV(outputCount, UNIT) * UNIT;
-    
+
     int quanCount = outputChannleUp4;
-    if (convParam->quanParameter() && convParam->quanParameter()->alpha()) {
+    if (convParam->quanParameter() && convParam->quanParameter()->alpha() && convParam->quanParameter()->buffer()) { // For block quant models.
         quanCount = convParam->quanParameter()->alpha()->size();
-        quanCount = ROUND_UP(quanCount, UNIT);
+        quanCount = ROUND_UP(quanCount, UNIT); // If block quant applied, quanCount > outputChannelUp4
+        if (quanCount < outputChannleUp4) {
+            MNN_PRINT("quantCount < outputUp4, check if need.\n");
+            quanCount = outputChannleUp4;
+        }
     }
     resource->mOriginBias.reset(Tensor::createDevice<int32_t>({quanCount}));
     resource->mOriginScale.reset(Tensor::createDevice<uint8_t>({quanCount * core->bytes}));
@@ -185,7 +190,7 @@ std::shared_ptr<CPUConvolution::ResourceInt8> CPUConvolution::makeResourceInt8(B
     int weightSize = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
     resource->mOutputCount = outputCount;
-    if (!ConvolutionCommon::getConvInt8Parameters(convParam, quanCommon, backend, weightSrc, weightSize, scalePtr, biasPtr, betaPtr)) {
+    if (!ConvolutionCommon::getConvInt8Parameters(op, quanCommon, backend, weightSrc, weightSize, scalePtr, biasPtr, betaPtr)) {
         return nullptr;
     }
     if (convParam->bias() && convParam->quanParameter()->alpha()) {
@@ -195,6 +200,7 @@ std::shared_ptr<CPUConvolution::ResourceInt8> CPUConvolution::makeResourceInt8(B
         resource->mWeightAsymmetricQuant = quanCommon->asymmetric;
     }
 
+    // TODO: first alloc.
     resource->mWeightInt8.reset(Tensor::createDevice<int8_t>({weightSize}));
     allocRes = backend->onAcquireBuffer(resource->mWeightInt8.get(), Backend::STATIC);
     if (!allocRes) {
@@ -238,6 +244,7 @@ std::shared_ptr<CPUConvolution::ResourceInt8> CPUConvolution::makeResourceInt8(B
         resource->mOutputScale = convParam->quanParameter()->scaleOut();
     }
     auto weightDst = resource->mWeightInt8->host<int8_t>();
+    // TODO(yanxing): don't copy!
     memcpy(weightDst, weightSrc, resource->mWeightInt8->size());
     resource->mRelu = convCommon->relu() || convCommon->relu6();
     if (convParam->symmetricQuan() && convParam->symmetricQuan()->outputDataType() == MNN::DataType_DT_FLOAT) {
@@ -247,8 +254,9 @@ std::shared_ptr<CPUConvolution::ResourceInt8> CPUConvolution::makeResourceInt8(B
     return resource;
 }
 
-void CPUConvolution::makeResource(Backend* backend, std::shared_ptr<Resource> resource, const Convolution2D* conv2d, std::shared_ptr<ResourceInt8> resourceInt8) {
+void CPUConvolution::makeResource(Backend* backend, std::shared_ptr<Resource> resource, const MNN::Op *op, std::shared_ptr<ResourceInt8> resourceInt8) {
     /* Used to compute weight quant scale and bias and weightKernelSum of type float. */
+    auto conv2d = op->main_as_Convolution2D();
     bool quanBuffer = (conv2d->quanParameter() != nullptr && conv2d->quanParameter()->buffer() != nullptr);
     MNN_ASSERT(quanBuffer || resourceInt8);
     resource->backend = backend;
@@ -269,11 +277,11 @@ void CPUConvolution::makeResource(Backend* backend, std::shared_ptr<Resource> re
     auto alphaPtr = resource->mDequantize.mScaleBias->host<float>();
     auto biasPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(alphaPtr) + ocUp4 * core->bytes);
     ::memset(alphaPtr, 0, 2 * ocUp4 * core->bytes);
-    
+
     std::shared_ptr<ConvolutionCommon::Int8Common> quantCommon;
     // Load quant scale and bias
     if (quanBuffer) {
-        quantCommon = ConvolutionCommon::load(conv2d, backend, false, true);
+        quantCommon = ConvolutionCommon::load(op, backend, false, true);
         weightOrigin = quantCommon->weight.get(); // weight before reorder
 
         int h = quantCommon->alpha.size();
@@ -321,7 +329,7 @@ void CPUConvolution::makeResource(Backend* backend, std::shared_ptr<Resource> re
             }
         }
     }
-    
+
     // Compute float weightKernelSum
     resource->mWeightKernelSum.reset(Tensor::createDevice<uint8_t>({ocUp4 * 4}));
     success = resource->backend->onAcquireBuffer(resource->mWeightKernelSum.get(), Backend::STATIC);
@@ -347,6 +355,73 @@ void CPUConvolution::makeResource(Backend* backend, std::shared_ptr<Resource> re
     }
 }
 
+void CPUConvolution::makeResourceNew(Backend* backend, const Convolution2D* conv2d, std::shared_ptr<ResourceInt8> resourceInt8) {
+    /* Used to compute weight quant scale and bias and weightKernelSum of type float. */
+    bool quanBuffer = (conv2d->quanParameter() != nullptr && conv2d->quanParameter()->buffer() != nullptr);
+    MNN_ASSERT(quanBuffer || resourceInt8);
+    auto core = static_cast<CPUBackend*>(backend)->functions();
+    // common parameters
+    int outputCount = conv2d->common()->outputCount();
+    int LSize = conv2d->common()->inputCount() * conv2d->common()->kernelX() * conv2d->common()->kernelY();
+    int ocUp4 = ROUND_UP(outputCount, core->pack);
+    int8_t* weightOrigin;
+
+    // Save weight quant scale and bias: wf=scale*wi+bias
+    std::shared_ptr<Tensor> scaleBias(Tensor::createDevice<uint8_t>({2 * ocUp4 * core->bytes}));
+    auto success = backend->onAcquireBuffer(scaleBias.get(), Backend::STATIC);
+    if (!success) {
+        MNN_ERROR("Alloc dequant scaleBias memory error\n");
+        return;
+    }
+    auto alphaPtr = scaleBias->host<float>();
+    auto biasPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(alphaPtr) + ocUp4 * core->bytes);
+    ::memset(alphaPtr, 0, 2 * ocUp4 * core->bytes);
+    
+    // Load quant scale and bias
+    weightOrigin = resourceInt8->mWeightInt8->host<int8_t>();
+    auto wZero = resourceInt8->mWeightQuantZero->host<int32_t>(); // has packed to outputUp4
+    auto wScale = resourceInt8->mOriginScale->host<float>();
+    int h = ocUp4;
+    if (core->bytes == 2) {
+        std::unique_ptr<int16_t[]> tmp(new int16_t[h]);
+        core->MNNFp32ToLowp(wScale, tmp.get(), h);
+        for (int i=0; i< h; ++i) {
+            reinterpret_cast<int16_t*>(alphaPtr)[i] = tmp[i];
+            reinterpret_cast<int16_t*>(biasPtr)[i] = (-1.f) * wZero[i] * tmp[i];
+        }
+    } else {
+        for (int i=0; i< h; ++i) {
+            alphaPtr[i] = wScale[i];
+            biasPtr[i] = (-1.f) * wZero[i] * wScale[i];
+        }
+    }
+    resourceInt8->mOriginScale = scaleBias;
+    
+    // Compute float weightKernelSum
+    resourceInt8->mWeightKernelSum.reset(Tensor::createDevice<uint8_t>({ocUp4 * 4}));
+    success = backend->onAcquireBuffer(resourceInt8->mWeightKernelSum.get(), Backend::STATIC);
+    if (!success) {
+        MNN_ERROR("Alloc dequant mWeightKernelSum memory error\n");
+        return;
+    }
+    auto weightKernelSum = resourceInt8->mWeightKernelSum->host<float>();
+    for (int i = 0; i < outputCount; ++i) {
+        int sum = 0;
+        for (int j = 0; j < LSize; ++j) {
+            sum = sum + static_cast<int>(weightOrigin[j + i * LSize]);
+        }
+        if(core->bytes == 2) {
+            auto scale = reinterpret_cast<int16_t*>(alphaPtr)[i];
+            auto bias = reinterpret_cast<int16_t*>(biasPtr)[i];
+            weightKernelSum[i] = static_cast<float>(sum) * scale + LSize * bias;
+        } else {
+            auto scale = alphaPtr[i];
+            auto bias = biasPtr[i];
+            weightKernelSum[i] = static_cast<float>(sum) * scale + LSize * bias;
+        }
+    }
+}
+
 CPUConvolution::CPUConvolution(const Convolution2DCommon *convOp, Backend *b) : MNN::Execution(b), mCommon(convOp) {
     // Do nothing
 }
@@ -399,16 +474,16 @@ class CPUConvInt8Creator : public CPUBackend::Creator {
         return OneDNNConvInt8::create(backend, convOp, inputs, outputs);
 #endif
         auto core = static_cast<CPUBackend*>(backend)->functions();
-        auto res = CPUConvolution::makeResourceInt8(backend, convOp, core->pack);
+        auto res = CPUConvolution::makeResourceInt8(backend, op, core->pack);
 #ifdef MNN_USE_SPARSE_COMPUTE
         if (static_cast<CPUBackend*>(backend)->functions()->pack == 4 && convOp->sparseParameter() && SparseConvInt8TiledExecutor::shouldUseSparse(convOp)) {
-            return new SparseConvInt8TiledExecutor(backend, convOp, res);
+            return new SparseConvInt8TiledExecutor(backend, op, res);
         }
 #endif
         if (ConvInt8Winograd::mustUse(convOp)) {
             return new ConvInt8Winograd(backend, convOp, res);
         }
-        return new DenseConvInt8TiledExecutor(backend, convOp, res, false);
+        return new DenseConvInt8TiledExecutor(backend, op, res);
     }
 };
 
diff --git a/source/backend/cpu/CPUConvolution.hpp b/source/backend/cpu/CPUConvolution.hpp
index d241007d6..a34f68aad 100644
--- a/source/backend/cpu/CPUConvolution.hpp
+++ b/source/backend/cpu/CPUConvolution.hpp
@@ -58,14 +58,16 @@ class CPUConvolution : public Execution {
         std::vector<float> mReluThreshold;
     };
     struct ResourceInt8 {
-        std::vector<int> mInt8WeightKernelSum;
-        std::shared_ptr<Tensor> mWeightInt8;
-        std::shared_ptr<Tensor> mOriginBias;
-        std::shared_ptr<Tensor> mOriginScale;
-        std::shared_ptr<Tensor> mWeightQuantZero;
+        std::vector<int> mInt8WeightKernelSum;     // PTQ's   sum, DynamicQ not use
+        std::shared_ptr<Tensor> mWeightInt8;       // PTQ's   and  DynamicQ's weight
+        std::shared_ptr<Tensor> mOriginBias;       // PTQ's   and  DynamicQ's bias
+        std::shared_ptr<Tensor> mOriginScale;      // PTQ's scale + bias, DynamicQ's alpha + zero;
+        std::shared_ptr<Tensor> mWeightQuantZero;  // PTQ's  zero
+        std::shared_ptr<Tensor> mWeightKernelSum;  // PTQ's   and  DynamicQ's weight kernel sum;
+        std::vector<float> mReluThreshold;
         // relu or relu6
         bool mRelu;
-        int mActBits;
+        int mActBits;  // quant bits
 
         int mOutputCount;
         bool mUseConvQuan = true;
@@ -97,8 +99,9 @@ class CPUConvolution : public Execution {
         int32_t mShiftBits = 14;
         bool mValid;
     };
-    static std::shared_ptr<ResourceInt8> makeResourceInt8(Backend *backend, const MNN::Convolution2D *convOp, int pack=4);
-    static void makeResource(Backend* backend, std::shared_ptr<Resource> resource, const Convolution2D* conv2d, std::shared_ptr<ResourceInt8> resourceInt8 = nullptr);
+    static std::shared_ptr<ResourceInt8> makeResourceInt8(Backend *backend, const MNN::Op *op, int pack=4);
+    static void makeResource(Backend* backend, std::shared_ptr<Resource> resource, const MNN::Op *op, std::shared_ptr<ResourceInt8> resourceInt8 = nullptr);
+    static void makeResourceNew(Backend* backend, const Convolution2D* conv2d, std::shared_ptr<ResourceInt8> resourceInt8);
     CPUConvolution(const Convolution2DCommon *convOp, Backend *b);
     virtual ~CPUConvolution() = default;
     virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
diff --git a/source/backend/cpu/CPUConvolutionDepthwise.cpp b/source/backend/cpu/CPUConvolutionDepthwise.cpp
index 03767edfa..f3fdf2cb3 100644
--- a/source/backend/cpu/CPUConvolutionDepthwise.cpp
+++ b/source/backend/cpu/CPUConvolutionDepthwise.cpp
@@ -265,7 +265,7 @@ class CPUConvolutionDepthwiseCreator : public CPUBackend::Creator {
         int originBiasSize   = 0;
         std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
         if (nullptr != conv2d->quanParameter()) {
-            quanCommon = ConvolutionCommon::load(conv2d, backend, true);
+            quanCommon = ConvolutionCommon::load(op, backend, true);
             // Back to float
             originWeight     = quanCommon->weightFloat.get();
             originWeightSize = quanCommon->weightFloat.size();
diff --git a/source/backend/cpu/CPUDeconvolution.cpp b/source/backend/cpu/CPUDeconvolution.cpp
index 0a1e6f813..0364ad58e 100644
--- a/source/backend/cpu/CPUDeconvolution.cpp
+++ b/source/backend/cpu/CPUDeconvolution.cpp
@@ -173,13 +173,13 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen
     auto biasPtr = _bias.data();
     auto scalePtr = _scale.data();
     auto betaPtr = _beta.data();
-    
+
     if (ModeInt8) {
-        ConvolutionCommon::getConvInt8Parameters(conv2d, quanCommon, backend, quanWeightInt8, tempWeightSize, scalePtr, biasPtr, betaPtr);
+        ConvolutionCommon::getConvInt8Parameters(convOp, quanCommon, backend, quanWeightInt8, tempWeightSize, scalePtr, biasPtr, betaPtr);
     } else {
-        ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2d, &tempWeight, &tempWeightSize);
+        ConvolutionCommon::getConvParameters(&quanCommon, backend, convOp, &tempWeight, &tempWeightSize);
     }
-    
+
     bool success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC) &&
                    backend->onAcquireBuffer(cache.get(), Backend::STATIC);
     if (!success) {
@@ -299,7 +299,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
     //int zeroPoint = 0;
 
     auto biasTensor = inputs[2];
-    
+
     // prepare for float2int8 if necessary.
     auto outputQuant = TensorUtils::getQuantInfo(outputs[0]);
     float scale = outputQuant[0];
@@ -333,7 +333,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
         }
         mMatMul.reset(new StrassenMatrixComputor(backend(), true, maxDepth));
         // tempInput->buffer().host = (uint8_t*)inputPtr;
-        
+
         needReleaseTempInput = false;
         TensorUtils::getDescribeOrigin(tempInput.get())->mem = new CPUMemObj(nullptr, TensorUtils::getDescribeOrigin(input)->mem->chunk(), 0);
         mMatMul->onEncode({tempInput.get(), inputs[1]}, {mTempOutput.get()});
diff --git a/source/backend/cpu/CPUDeconvolution.hpp b/source/backend/cpu/CPUDeconvolution.hpp
index c9e0427f0..ed932e0b4 100644
--- a/source/backend/cpu/CPUDeconvolution.hpp
+++ b/source/backend/cpu/CPUDeconvolution.hpp
@@ -45,17 +45,18 @@ class CPUDeconvolutionOrigin : public CPUDeconvolutionBasic {
             auto conv2d = convOp->main_as_Convolution2D();
             auto common = conv2d->common();
             auto pack = static_cast<CPUBackend*>(b)->functions()->pack;
-            mResource = CPUConvolution::makeResourceInt8(backend(), conv2d, pack);
+            mResource = CPUConvolution::makeResourceInt8(backend(), convOp, pack);
             CPUConvolution::MutableResourceInt8 mutableResource(mResource, b);
             auto core = static_cast<CPUBackend*>(b)->int8Functions();
             auto gemmKernel = core->Int8GemmKernel;
             int UNIT, SRC_UNIT, DST_XUNIT;
             core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
             const auto kEleCnt = mCommon->kernelX() * mCommon->kernelY();
-            const int ocDiv4 = UP_DIV(common->outputCount(), UNIT) * kEleCnt; 
+            const int ocDiv4 = UP_DIV(common->outputCount(), pack) * kEleCnt; 
             const int icDiv4 = UP_DIV(common->inputCount(), SRC_UNIT);
+            const int ocDivUnit = UP_DIV(common->outputCount(), UNIT); 
             const int oc4 = ocDiv4 / kEleCnt;
-            const int bias_elesize = ocDiv4 * UNIT;
+            const int bias_elesize = ocDiv4 * pack;
             // set offset if use SSE.
             auto inputQuant = TensorUtils::getQuantInfo(input);
             auto inputZeroPoint = inputQuant[1];
@@ -66,7 +67,7 @@ class CPUDeconvolutionOrigin : public CPUDeconvolutionBasic {
                 gemmKernel = core->Int8GemmKernelFast;
             }
             for (int a = 0; a < kEleCnt; ++a){
-                for (int oz = 0; oz < oc4 * UNIT; ++oz) {
+                for (int oz = 0; oz < ocDivUnit * UNIT; ++oz) {
                 int offset = inputZeroPoint, oz4 = oz / UNIT, ozRemain = oz % UNIT;
                 for (int sz = 0; sz < icDiv4 * SRC_UNIT; ++sz) {
                     int sz4 = sz / SRC_UNIT, szRemain = sz % SRC_UNIT;
@@ -74,7 +75,9 @@ class CPUDeconvolutionOrigin : public CPUDeconvolutionBasic {
                     auto weightInt8Data = weightDataPtr[index];
                     offset += weightInt8Data * (-128);
                 }
-                _bias[a * oc4 * UNIT + oz] = offset;
+                if (oz < oc4 * pack) {
+                    _bias[a * oc4 * pack + oz] = offset;
+                }
         }
     }
 #else
@@ -82,7 +85,7 @@ class CPUDeconvolutionOrigin : public CPUDeconvolutionBasic {
                 gemmKernel = core->Int8GemmKernelFast;
             }
 #endif
-            mDeconvInt8Exe.reset(new GemmInt8Executor(b, mResource, conv2d, gemmKernel, _bias));
+            mDeconvInt8Exe.reset(new GemmInt8Executor(b, mResource, convOp, gemmKernel, _bias));
         }
     }
     virtual ~CPUDeconvolutionOrigin() = default;
diff --git a/source/backend/cpu/CPUDeconvolutionDepthwise.cpp b/source/backend/cpu/CPUDeconvolutionDepthwise.cpp
index 768abbad0..4e1b7f04e 100644
--- a/source/backend/cpu/CPUDeconvolutionDepthwise.cpp
+++ b/source/backend/cpu/CPUDeconvolutionDepthwise.cpp
@@ -27,7 +27,7 @@ CPUDeconvolutionDepthwise::CPUDeconvolutionDepthwise(const Tensor* input, const
     const float* tempWeight = nullptr;
     int tempWeightSize   = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, b, conv, &tempWeight, &tempWeightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, b, convOp, &tempWeight, &tempWeightSize);
 
     // Reorder weight from whc -> pwhc4
     int kernelSize = depthQuad * core->pack * kw * kh;
diff --git a/source/backend/cpu/CPUDepthwiseConvInt8.cpp b/source/backend/cpu/CPUDepthwiseConvInt8.cpp
index 347dd4839..0df722bb4 100644
--- a/source/backend/cpu/CPUDepthwiseConvInt8.cpp
+++ b/source/backend/cpu/CPUDepthwiseConvInt8.cpp
@@ -252,7 +252,7 @@ class CPUDepthwiseConvInt8Creator : public CPUBackend::Creator {
                UNIT = 4;
            }
         }
-        auto res = CPUConvolution::makeResourceInt8(backend, convOp, UNIT);
+        auto res = CPUConvolution::makeResourceInt8(backend, op, UNIT);
         const int kernelSize      = common->kernelX() * common->kernelY();
         const int outputCount     = common->outputCount();
         const int ocDivUnit       = UP_DIV(outputCount, UNIT);
diff --git a/source/backend/cpu/KVCacheManager.cpp b/source/backend/cpu/KVCacheManager.cpp
new file mode 100644
index 000000000..7804d3dd5
--- /dev/null
+++ b/source/backend/cpu/KVCacheManager.cpp
@@ -0,0 +1,467 @@
+//
+//  KVCacheManager.cpp
+//  MNN
+//
+//  Created by MNN on 2024/08/05.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef MNN_SUPPORT_TRANSFORMER_FUSE
+
+#include "KVCacheManager.hpp"
+#include "core/Concurrency.h"
+
+namespace MNN {
+
+// @brief  Translate an address to a hex number string
+static inline std::string addrToHex(void *addr) {
+    std::string result = "";
+    uint64_t n = (uint64_t)addr;
+    for(int i = 15; i >= 0; i--) {
+        int t = (n >> (i * 4)) & 0x0f;
+        result.push_back((t < 10) ? ('0' + t) : ('A' + t - 10));
+    }
+    return result;
+}
+
+void KVCacheManager::createKVCacheFile() {
+    // Each layer has its own kvcache, so we have to create a key file and a value file for each layer and the file name must be unique
+    // Here we use the address of the mResource as the file name because the addresses of mResource in different layers are guaranteed to be different
+    std::string fileName = addrToHex(this);
+    std::string pathk    = MNNFilePathConcat(mConfig.mKVCacheDir, fileName) + ".k";
+    std::string pathv    = MNNFilePathConcat(mConfig.mKVCacheDir, fileName) + ".v";
+    mKeyCacheFD   = MNNCreateFile(pathk.c_str());
+    mValueCacheFD = MNNCreateFile(pathv.c_str());
+    if (mKeyCacheFD == INVALID_FILE) {
+        MNN_PRINT("Failed to create the file: %s\n", pathk.c_str());
+    }
+    if (mValueCacheFD == INVALID_FILE) {
+        MNN_PRINT("Failed to create the file: %s\n", pathv.c_str());
+    }
+}
+
+void KVCacheManager::removeKVCacheFile() {
+    std::string fileName = addrToHex(this);
+    std::string pathk = MNNFilePathConcat(mConfig.mKVCacheDir, fileName) + ".k";
+    std::string pathv = MNNFilePathConcat(mConfig.mKVCacheDir, fileName) + ".v";
+    if (mKeyCacheFD != INVALID_FILE) {
+        MNNCloseFile(mKeyCacheFD);
+        mKeyCacheFD = INVALID_FILE;
+        if (MNNRemoveFile(pathk.c_str()) != MNN::NO_ERROR) {
+            MNN_PRINT("Failed to remove the file: %s\n", pathk.c_str());
+        }
+    }
+    if (mValueCacheFD != INVALID_FILE) {
+        MNNCloseFile(mValueCacheFD);
+        mValueCacheFD = INVALID_FILE;
+        if (MNNRemoveFile(pathv.c_str()) != MNN::NO_ERROR) {
+            MNN_PRINT("Failed to remove the file: %s\n", pathv.c_str());
+        }
+    }
+}
+
+void KVCacheManager::resetKVCacheFileSize(size_t keySize, size_t valueSize) {
+    if (MNNSetFileSize(mKeyCacheFD, keySize) != MNN::NO_ERROR || MNNSetFileSize(mValueCacheFD, valueSize) != MNN::NO_ERROR) {
+        MNN_PRINT("Failed to resize the kvcache files!\n");
+    }
+}
+
+/*
+**  @brief  Memory-map the kvcache file
+**  @hint   After memory-mapping, we can access the kvcache files with pointers, just like accessing memory buffer
+**          But the data actually resides in disk.
+**          The OS will set some kernel page cache and manage the data swaping, which we do not need to care.
+*/
+void KVCacheManager::mmapKVCache(size_t keySize, size_t valueSize)
+{
+    if (mMapKeyAddr == nullptr) {
+        mMapKeyAddr = (char *)MNNMmapFile(mKeyCacheFD, keySize);
+        if (mMapKeyAddr == nullptr) {
+            MNN_PRINT("Failed to memory-map the kvcache!\n");
+        }
+    }
+    if (mMapValueAddr == nullptr) {
+        mMapValueAddr = (char *)MNNMmapFile(mValueCacheFD, valueSize);
+        if (mMapValueAddr == nullptr) {
+            MNN_PRINT("Failed to memory-map the kvcache!\n");
+        }
+    }
+}
+
+void KVCacheManager::unmapKVCache(size_t keySize, size_t valueSize)
+{
+    if (mMapKeyAddr != nullptr) {
+        MNNUnmapFile(mMapKeyAddr, keySize);
+        mMapKeyAddr = nullptr;
+    }
+    if (mMapValueAddr != nullptr) {
+        MNNUnmapFile(mMapValueAddr, valueSize);
+        mMapValueAddr = nullptr;
+    }
+}
+
+/*
+**  @brief  Expand the size of kvcache and copy it from the old tensor in memory to the new tensor in memory
+**          Finally reset the pointer to the new tensor
+*/
+void KVCacheManager::expandKVCacheInMem(int oldMaxLength) {
+    /*===================================  Key  ===================================*/
+    if (mConfig.mQuantKey) {
+        auto new_key = Tensor::createDevice<int8_t>({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP});
+        mBackend->onAcquireBuffer(new_key, Backend::STATIC);
+        for (int h = 0; h < mKvNumHead; h++) {
+            memcpy(new_key->host<char>() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP,  mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, UP_DIV(oldMaxLength, hP) * mHeadDim * hP);
+        }
+        mPastKey.reset(new_key);
+    }
+    else {
+        auto new_key = Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP});
+        mBackend->onAcquireBuffer(new_key, Backend::STATIC);
+        for (int h = 0; h < mKvNumHead; h++) {
+            memcpy(new_key->host<char>() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes);
+        }
+        mPastKey.reset(new_key);
+    }
+    /*===================================  Value  ===================================*/
+    if (mConfig.mQuantValue) {
+        auto new_value = Tensor::createDevice<fp8_t>({mKvNumHead, UP_DIV(mHeadDim, hP), mMaxLength, hP});
+        mBackend->onAcquireBuffer(new_value, Backend::STATIC);
+        for (int h = 0; h < mKvNumHead; h++) {
+            for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
+                memcpy(new_value->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, mPastValue->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, oldMaxLength * hP);
+            }
+        }
+        mPastValue.reset(new_value);
+    }
+    else {
+        auto new_value = Tensor::createDevice<float>({mKvNumHead, UP_DIV(mHeadDim, hP), mMaxLength, hP});
+        mBackend->onAcquireBuffer(new_value, Backend::STATIC);
+        for (int h = 0; h < mKvNumHead; h++) {
+            for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
+                memcpy(new_value->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, mPastValue->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, oldMaxLength * hP * mBytes);
+            }
+        }
+        mPastValue.reset(new_value);
+    }
+}
+
+/*
+**  @brief  Move the kvcache from memory to the memory-mapped kvcache files in disk
+**          Then release the memory buffer of old kvcache
+*/
+void KVCacheManager::moveKVCacheFromMemToDisk(int oldMaxLength) {
+    /*===================================  Key  ===================================*/
+    if (mConfig.mQuantKey) {
+        for (int h = 0; h < mKvNumHead; h++) {
+            memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP, mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, UP_DIV(oldMaxLength, hP) * mHeadDim * hP);
+        }
+        mBackend->onReleaseBuffer(mPastKey.get(), Backend::STATIC);
+        mPastKey.reset();
+    }
+    else {
+        for (int h = 0; h < mKvNumHead; h++) {
+            memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, mPastKey->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes);
+        }
+        mBackend->onReleaseBuffer(mPastKey.get(), Backend::STATIC);
+        mPastKey.reset();
+    }
+    /*===================================  Value  ===================================*/
+    if (mConfig.mQuantValue) {
+        for (int h = 0; h < mKvNumHead; h++) {
+            for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
+                memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, mPastValue->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, oldMaxLength * hP);
+            }
+        }
+        mBackend->onReleaseBuffer(mPastValue.get(), Backend::STATIC);
+        mPastValue.reset();
+    }
+    else {
+        for (int h = 0; h < mKvNumHead; h++) {
+            for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
+                memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, mPastValue->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, oldMaxLength * hP * mBytes);
+            }
+        }
+        mBackend->onReleaseBuffer(mPastValue.get(), Backend::STATIC);
+        mPastValue.reset();
+    }
+}
+
+/*
+**  @brief  Expand the size of kvcache files in disk
+*/
+void KVCacheManager::expandKVCacheInDisk(int oldMaxLength) {
+    size_t oldKeySize   = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
+    size_t oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * oldMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
+    size_t keySize      = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
+    size_t valueSize    = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
+    // Step 1: Copy the old kvcache from files to temporary buffers in memory
+    std::shared_ptr<Tensor> old_key, old_value;
+    if (mConfig.mQuantKey) {
+        old_key.reset(Tensor::createDevice<int8_t>({mKvNumHead, UP_DIV(oldMaxLength, hP), mHeadDim, hP}));
+    } else {
+        old_key.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(oldMaxLength, hP), mHeadDim, hP}));  
+    }
+    if (mConfig.mQuantValue) {
+        old_value.reset(Tensor::createDevice<fp8_t>({mKvNumHead, UP_DIV(mHeadDim, hP), oldMaxLength, hP}));
+    } else {
+        old_value.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mHeadDim, hP), oldMaxLength, hP}));
+    }
+    mBackend->onAcquireBuffer(old_key.get(), Backend::STATIC);
+    mBackend->onAcquireBuffer(old_value.get(), Backend::STATIC);
+    mmapKVCache(oldKeySize, oldValueSize);
+    memcpy(old_key->host<char>(),   mMapKeyAddr,   oldKeySize);
+    memcpy(old_value->host<char>(), mMapValueAddr, oldValueSize);
+    // Step 2: Resize the kvcache files and remap them
+    unmapKVCache(oldKeySize, oldValueSize);
+    resetKVCacheFileSize(keySize, valueSize);
+    mmapKVCache(keySize, valueSize);
+    // Step 3: Move the kvcache from temporary buffers in memory to disk
+    if (mConfig.mQuantKey) {
+        for (int h = 0; h < mKvNumHead; h++) {
+            memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP,  old_key->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, UP_DIV(oldMaxLength, hP) * mHeadDim * hP);
+        }
+    } else {
+        for (int h = 0; h < mKvNumHead; h++) {
+            memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, old_key->host<char>() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes);
+        }
+    }
+    if (mConfig.mQuantValue) {
+        for (int h = 0; h < mKvNumHead; h++) {
+            for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
+                memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, old_value->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, oldMaxLength * hP);
+            }
+        }
+    } else {
+        for (int h = 0; h < mKvNumHead; h++) {
+            for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
+                memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, old_value->host<char>() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, oldMaxLength * hP * mBytes);
+            }
+        }
+    }
+    // Step 4: Release the temporary buffers
+    mBackend->onReleaseBuffer(old_key.get(), Backend::STATIC);
+    mBackend->onReleaseBuffer(old_value.get(), Backend::STATIC);
+}
+
+void KVCacheManager::onResize(int kv_num_head, int head_dim) {
+    mKvNumHead = kv_num_head;
+    mHeadDim = head_dim;
+    auto core  = static_cast<CPUBackend *>(mBackend)->functions();
+    core->MNNGetMatMulPackMode(&eP, &lP, &hP);
+    mBytes = core->bytes;
+    mThreadNum = static_cast<CPUBackend *>(mBackend)->threadNumber();
+    if (mThreadNum > mKvNumHead) {
+        mThreadNum = mKvNumHead;
+    }
+}
+
+void KVCacheManager::onAlloc(int kv_seq_len) {
+    mMaxLength = kv_seq_len + mConfig.mExpandChunk;
+    size_t keySize   = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
+    size_t valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
+    /*============== Put the kvcache in disk ===========*/
+    if (mConfig.mKVCacheSizeLimit != -1 && keySize + valueSize > mConfig.mKVCacheSizeLimit) {
+        createKVCacheFile();
+        resetKVCacheFileSize(keySize, valueSize);
+        mmapKVCache(keySize, valueSize);
+        mKVCacheInDisk = true;
+    }
+    /*============== Put the kvcache in memory ===========*/
+    else {
+        if (mConfig.mQuantKey) {
+            mPastKey.reset(Tensor::createDevice<int8_t>({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP}));
+        } else {
+            mPastKey.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP}));
+        }
+        if (mConfig.mQuantValue) {
+            mPastValue.reset(Tensor::createDevice<fp8_t>({mKvNumHead, UP_DIV(mHeadDim, hP), mMaxLength, hP}));
+        } else {
+            mPastValue.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mHeadDim, hP), mMaxLength, hP}));
+        }
+        mBackend->onAcquireBuffer(mPastKey.get(), Backend::STATIC);
+        mBackend->onAcquireBuffer(mPastValue.get(), Backend::STATIC);    
+    }
+    /* No matter where is the kvcache, the scales and zero points are always in memory, since their size is very small */
+    if (mConfig.mQuantKey) {
+        mDequantKeyScale.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP}));
+        mDequantKeyZeroPoint.reset(Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP}));
+        mBackend->onAcquireBuffer(mDequantKeyScale.get(), Backend::STATIC);
+        mBackend->onAcquireBuffer(mDequantKeyZeroPoint.get(), Backend::STATIC);
+    }
+}
+
+void KVCacheManager::onRealloc(int kv_seq_len) {
+    if (kv_seq_len <= mMaxLength) {
+        return;
+    }
+    int oldMaxLength = mMaxLength;
+    mMaxLength = kv_seq_len + mConfig.mExpandChunk;
+    size_t oldKeySize   = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
+    size_t oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * oldMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
+    size_t keySize      = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
+    size_t valueSize    = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
+    /*==== No limit for kvcache ====*/
+    if (mConfig.mKVCacheSizeLimit == -1) {
+        expandKVCacheInMem(oldMaxLength);
+    }
+    /*==== Last time the kvcache is memory, now it should be in memory too ====*/
+    else if (keySize + valueSize <= mConfig.mKVCacheSizeLimit) {
+        expandKVCacheInMem(oldMaxLength);
+    }
+    /*==== Last time the kvcache is in memory, but now it should be moved to disk ====*/
+    else if (oldKeySize + oldValueSize <= mConfig.mKVCacheSizeLimit) {
+        createKVCacheFile();
+        resetKVCacheFileSize(keySize, valueSize);
+        mmapKVCache(keySize, valueSize);
+        moveKVCacheFromMemToDisk(oldMaxLength);
+        mKVCacheInDisk = true;
+    }
+    /*==== Last time the kvcache is disk, now it should be in disk too ====*/
+    else {
+        expandKVCacheInDisk(oldMaxLength);
+    }
+    /* No matter where is the kvcache, the scales and zero points are always in memory, since their size is very small */
+    if (mConfig.mQuantKey) {
+        auto new_scale = Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP});
+        auto new_zeroPoint = Tensor::createDevice<float>({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP});
+        mBackend->onAcquireBuffer(new_scale, Backend::STATIC);
+        mBackend->onAcquireBuffer(new_zeroPoint, Backend::STATIC);
+        for (int h = 0; h < mKvNumHead; h++) {
+            memcpy(new_scale->host<char>() + h * UP_DIV(mMaxLength, hP) * hP * mBytes, mDequantKeyScale->host<char>() + h * UP_DIV(oldMaxLength, hP) * hP * mBytes, UP_DIV(oldMaxLength, hP) * hP * mBytes);
+            memcpy(new_zeroPoint->host<char>() + h * UP_DIV(mMaxLength, hP) * hP * mBytes, mDequantKeyZeroPoint->host<char>() + h * UP_DIV(oldMaxLength, hP) * hP * mBytes, UP_DIV(oldMaxLength, hP) * hP * mBytes);
+        }
+        mDequantKeyScale.reset(new_scale);
+        mDequantKeyZeroPoint.reset(new_zeroPoint);
+    }
+}
+
+void KVCacheManager::onClear() {
+    if (mKVCacheInDisk) {
+        size_t oldKeySize   = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
+        size_t oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
+        unmapKVCache(oldKeySize, oldValueSize);
+        removeKVCacheFile();
+        mKVCacheInDisk = false;
+    }
+    else {
+        mPastKey.reset();
+        mPastValue.reset();
+    }
+    mMaxLength = mPastLength = 0;
+}
+
+template <typename T>
+static void pack_key(const Tensor* key, char* pack_key, int mPastLength, int seq_len, int mKvNumHead, int mHeadDim,
+                    int hP, int kv_h, bool quantKey, char* scale, char* zero_point, const MNN::CoreFunctions * core) {
+    if (quantKey) {
+        int8_t * key_dst = reinterpret_cast<int8_t*>(pack_key);
+        T * scale_dst = reinterpret_cast<T*>(scale);
+        T * zeroPoint_dst = reinterpret_cast<T*>(zero_point);
+        for (int i = 0; i < seq_len; i++) {
+            T * key_src = key->host<T>() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim;
+            int out_index = (mPastLength + i) / hP;
+            int in_index  = (mPastLength + i) % hP;
+            T minKey, maxKey;
+            core->MNNCountMaxMinValue((float*)key_src, (float*)&minKey, (float*)&maxKey, mHeadDim);
+            scale_dst[out_index * hP + in_index] = (maxKey - minKey) / 255.0f;
+            zeroPoint_dst[out_index * hP + in_index] = 128.0f * (maxKey - minKey) / 255.0f + minKey;
+            for (int j = 0; j < mHeadDim; j++) {
+                key_dst[out_index * mHeadDim * hP + j * hP + in_index] = roundf((key_src[j] - minKey) / (maxKey - minKey) * 255 - 128);
+            }
+        }
+    }
+    else {
+        T * key_dst = reinterpret_cast<T*>(pack_key);
+        for (int i = 0; i < seq_len; i++) {
+            T * key_src = key->host<T>() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim;
+            int out_index = (mPastLength + i) / hP;
+            int in_index  = (mPastLength + i) % hP;
+            for (int j = 0; j < mHeadDim; j++) {
+                key_dst[out_index * mHeadDim * hP + j * hP + in_index] = key_src[j];
+            }
+        }
+    }
+}
+
+template <typename T>
+static void pack_value(const Tensor* value, char* pack_value, int mMaxLength, int mPastLength, int seq_len, int mKvNumHead, int mHeadDim, int hP, int kv_h, bool quantValue, const MNN::CoreFunctions * core) {
+    if (quantValue) {
+        fp8_t * value_dst = reinterpret_cast<fp8_t*>(pack_value);
+        uint8_t * buf = (uint8_t *)MNNMemoryAllocAlign(mHeadDim, MNN_MEMORY_ALIGN_DEFAULT);
+        for (int i = 0; i < seq_len; i++) {
+            T * value_src = value->host<T>() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim;
+            if (sizeof(T) == 2) {
+                core->MNNFp16ToFp8(buf, (uint16_t*)value_src, mHeadDim);
+            } else {
+                core->MNNFp32ToFp8(buf, (float*)value_src, mHeadDim);
+            }
+            for (int j = 0; j < mHeadDim; j++) {
+                int out_index = j / hP;
+                int in_index  = j % hP;
+                value_dst[out_index * mMaxLength * hP + (mPastLength + i) * hP + in_index] = buf[j];
+            }
+        }
+        MNNMemoryFreeAlign(buf);
+    }
+    else {
+        T * value_dst = reinterpret_cast<T*>(pack_value);
+        for (int i = 0; i < seq_len; i++) {
+            T * value_src = value->host<T>() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim;
+            for (int j = 0; j < mHeadDim; j++) {
+                int out_index = j / hP;
+                int in_index  = j % hP;
+                value_dst[out_index * mMaxLength * hP + (mPastLength + i) * hP + in_index] = value_src[j];
+            }
+        }
+    }
+}
+
+void KVCacheManager::onPushBack(const Tensor * key, const Tensor * value) {
+    auto core = static_cast<CPUBackend*>(mBackend)->functions();
+    int seq_len = key->shape()[1];
+    int tileCount = UP_DIV(mKvNumHead, mThreadNum);
+    std::function<void(int)> packKV = [=](int tid) {
+        for (int kv_h = tid * tileCount; kv_h < (tid+1) * tileCount && kv_h < mKvNumHead; kv_h++) {
+            if (mBytes == 2) {
+                pack_key<FLOAT16_T>(key, addrOfKey(kv_h), mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantKey, addrOfScale(kv_h), addrOfZeroPoint(kv_h), core);
+                pack_value<FLOAT16_T>(value, addrOfValue(kv_h), mMaxLength, mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantValue, core);
+            } else {
+                pack_key<float>(key, addrOfKey(kv_h), mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantKey, addrOfScale(kv_h), addrOfZeroPoint(kv_h), core);
+                pack_value<float>(value, addrOfValue(kv_h), mMaxLength, mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantValue, core);
+            }
+        }
+    };
+    MNN_CONCURRENCY_BEGIN(tid, mThreadNum) {
+        packKV((int)tid);
+    }
+    MNN_CONCURRENCY_END();
+    mPastLength += seq_len;
+}
+
+void KVCacheManager::onDequantValue(Tensor * dequantedValues) {
+    auto core = static_cast<CPUBackend*>(mBackend)->functions();
+    int tileCount = UP_DIV(mKvNumHead, mThreadNum);
+    std::function<void(int)> dequant = [=](int tid) {
+        for (int kv_h = tid * tileCount; kv_h < (tid+1) * tileCount && kv_h < mKvNumHead; kv_h++) {
+            char * dst = dequantedValues->host<char>() + kv_h * UP_DIV(mHeadDim, hP) * mPastLength * hP * mBytes;
+            char * src = addrOfValue(kv_h);
+            for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) {
+                if (mBytes == 2) {
+                    core->MNNFp8ToFp16((uint16_t*)dst, (uint8_t*)src, mPastLength * hP);
+                } else {
+                    core->MNNFp8ToFp32((float*)dst, (uint8_t*)src, mPastLength * hP);
+                }
+                dst += mPastLength * hP * mBytes;
+                src += mMaxLength * hP;
+            }
+        }
+    };
+    MNN_CONCURRENCY_BEGIN(tid, mThreadNum) {
+        dequant((int)tid);
+    }
+    MNN_CONCURRENCY_END();
+}
+
+} // namespace MNN
+
+#endif // MNN_SUPPORT_TRANSFORMER_FUSE
\ No newline at end of file
diff --git a/source/backend/cpu/KVCacheManager.hpp b/source/backend/cpu/KVCacheManager.hpp
new file mode 100644
index 000000000..582481990
--- /dev/null
+++ b/source/backend/cpu/KVCacheManager.hpp
@@ -0,0 +1,129 @@
+//
+//  KVCacheManager.hpp
+//  MNN
+//
+//  Created by MNN on 2024/08/05.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef MNN_SUPPORT_TRANSFORMER_FUSE
+
+#ifndef KVCACHE_MANAGER_HPP
+#define KVCACHE_MANAGER_HPP
+
+#include "core/Macro.h"
+#include "core/MNNFileUtils.h"
+#include "backend/cpu/CPUBackend.hpp"
+#include "backend/cpu/compute/CommonOptFunction.h"
+
+#if defined (__aarch64__)
+#define FLOAT16_T __fp16
+#else
+#define FLOAT16_T float
+#endif
+
+typedef uint8_t fp8_t;
+
+namespace MNN {
+
+class KVCacheManager : public NonCopyable{
+public:
+    struct KVCacheConfig {
+        bool mQuantKey   = false;               // Quantize keys to int8 or not
+        bool mQuantValue = false;               // Quantize values to fp8 or not
+        std::string mKVCacheDir = "/tmp";       // Path of the kvcache files in disk
+        size_t mKVCacheSizeLimit = -1;          // The limit of the kvcache size
+        int  mExpandChunk = 64;                 // Number of expand chunks when the buffer is full
+    };
+private:
+    Backend * mBackend;
+    KVCacheConfig mConfig;
+    std::shared_ptr<Tensor> mPastKey;               // numhead, [maxlen/eP, headdim, eP]
+    std::shared_ptr<Tensor> mPastValue;             // numhead, [headdim/eP, maxlen, eP]
+    std::shared_ptr<Tensor> mDequantKeyScale;       // numhead, [maxlen/eP, 1, eP]
+    std::shared_ptr<Tensor> mDequantKeyZeroPoint;   // numhead, [maxlen/eP, 1, eP]
+    file_t mKeyCacheFD   = INVALID_FILE;            // The file descriptor of keys
+    file_t mValueCacheFD = INVALID_FILE;            // The file descriptor of values
+    char * mMapKeyAddr   = nullptr;                 // Memory-mapped address of keys
+    char * mMapValueAddr = nullptr;                 // Memory-mapped address of values
+    bool mKVCacheInDisk  = false;                   // Whether the kvcache is in disk or in memory now
+    int  mPastLength     = 0;                       // Length of past kvcache
+    int  mMaxLength      = 0;                       // Capacity of current kvcache buffer (how many kv items can be stored at most)
+    int  eP, lP, hP, mBytes, mThreadNum;
+    int  mKvNumHead = 0, mHeadDim   = 0;
+    void createKVCacheFile();
+    void removeKVCacheFile();
+    void resetKVCacheFileSize(size_t keySize, size_t valueSize);
+    void mmapKVCache(size_t keySize, size_t valueSize);
+    void unmapKVCache(size_t keySize, size_t valueSize);
+    void expandKVCacheInMem(int oldMaxLength);
+    void moveKVCacheFromMemToDisk(int oldMaxLength);
+    void expandKVCacheInDisk(int oldMaxLength);
+public:
+    KVCacheManager(Backend * backend, KVCacheConfig & kvConfig) {
+        mBackend   = backend;
+        mConfig    = kvConfig; 
+    }
+    ~KVCacheManager() {
+        onClear();
+    }
+    const Backend * backend() {
+        return mBackend;
+    }
+    const KVCacheConfig * config() {
+        return &mConfig;
+    }
+    const Tensor * key() {
+        return mPastKey.get();
+    }
+    const Tensor * value() {
+        return mPastValue.get();
+    }
+    const Tensor * scale() {
+        return mDequantKeyScale.get();
+    }
+    const Tensor * zeroPoint() {
+        return mDequantKeyZeroPoint.get();
+    }
+    bool inDisk() {
+        return mKVCacheInDisk;
+    }
+    int kvLength() {
+        return mPastLength;
+    }
+    int maxLength() {
+        return mMaxLength;
+    }
+    char * addrOfKey(int kv_h) {
+        char * baseAddr = mKVCacheInDisk ? mMapKeyAddr : mPastKey->host<char>();
+        return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes);
+    }
+    char * addrOfValue(int kv_h) {
+        char * baseAddr = mKVCacheInDisk ? mMapValueAddr : mPastValue->host<char>();
+        return baseAddr + kv_h * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes);
+    }
+    char * addrOfScale(int kv_h) {
+        if (mConfig.mQuantKey == false)
+            return nullptr;
+        char * baseAddr = mDequantKeyScale->host<char>();
+        return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * 1 * hP * mBytes;
+    }
+    char * addrOfZeroPoint(int kv_h) {
+        if (mConfig.mQuantKey == false)
+            return nullptr;
+        char * baseAddr = mDequantKeyZeroPoint->host<char>();
+        return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * 1 * hP * mBytes;
+    }
+    void onResize(int kv_num_head, int head_dim);
+    void onAlloc(int kv_seq_len);
+    void onRealloc(int kv_seq_len);
+    void onClear();
+    void onPushBack(const Tensor * key, const Tensor * value);
+    void onDequantValue(Tensor * dequantedValues);
+};
+
+} // namespace MNN
+
+#endif // KVCACHE_MANAGER_HPP
+
+#endif // MNN_SUPPORT_TRANSFORMER_FUSE
\ No newline at end of file
diff --git a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S
index 6368937de..f7988025b 100644
--- a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S
+++ b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S
@@ -67,7 +67,7 @@ ldr r12, [r6, #36] // f32minmax
 str r12, [sp, #12]
 ldr r12, [r6, #40] // blockNum
 mul r12, r12, r3   // src_depth_quad=src_depth_quad*blockNum
-lsl r12, r12, #6   // weight_stride = src_depth_quad*LP*HP
+lsl r12, r12, #5   // weight_stride = src_depth_quad*LP*HP
 str r12, [sp, #16]
 ldr r12, [r6, #48] // extraScale
 str r12, [sp, #20]
@@ -198,8 +198,8 @@ L2LoopDz:
     // vaddq.s32 q0, q8, q4 // add bias
     // vaddq.s32 q1, q9, q4
 
-    vcvt.f32.s32 q0, q0
-    vcvt.f32.s32 q1, q1
+    vcvt.f32.s32 q0, q8
+    vcvt.f32.s32 q1, q9
 
     vmulq.f32 q0, q0, q5 // mul scale
     vmulq.f32 q1, q1, q5
@@ -224,6 +224,19 @@ L2LoopDz:
     vmla.f32 q0, q7, d12[0]
     vmla.f32 q1, q7, d12[1]
 
+    L2_ADD_BIAS:
+    cmp lr, #0
+    beq L2_ADD_DSTV
+    vld1.f32 {q4}, [lr]! // bias
+    vadd.f32 q0, q0, q4  // bias
+    vadd.f32 q1, q1, q4
+    b L2_POST
+
+    L2_ADD_DSTV:
+    vld1.f32 {q4, q5}, [r0]
+    vadd.f32 q0, q0, q4
+    vadd.f32 q1, q1, q5
+
     L2_POST:
     ldr r6, [sp, #12] // fp32 minmax
     cmp r6, #0
@@ -334,7 +347,7 @@ L1LoopDz:
     vpadd.s32 d17, d20, d22
 
     // vaddq.s32 q0, q8, q4
-    vcvt.f32.s32 q0, q0
+    vcvt.f32.s32 q0, q8
     vmulq.f32 q0, q0, q5
     // extra scale if has
     ldr r6, [sp, #20]
diff --git a/source/backend/cpu/arm/arm64/MNNAvgPoolInt8.S b/source/backend/cpu/arm/arm64/MNNAvgPoolInt8.S
index 7c77c0fc2..9798b29bf 100644
--- a/source/backend/cpu/arm/arm64/MNNAvgPoolInt8.S
+++ b/source/backend/cpu/arm/arm64/MNNAvgPoolInt8.S
@@ -22,10 +22,10 @@ asm_function MNNAvgPoolInt8
 ldr x8, [sp, #0]
 dup v24.4s, w8
 
-sub sp, sp, #32
-str x19, [sp, #0]
-str x20, [sp, #8]
-str x21, [sp, #16]
+stp d14, d15, [sp, #(-16 * 4)]!
+stp d12, d13, [sp, #16]
+stp d10, d11, [sp, #32]
+stp d8,  d9,  [sp, #48]
 
 cmp x4, #0
 ble END
@@ -206,10 +206,10 @@ cmp x2, #0
 beq END
 
 END:
-ldr x19, [sp, #0]
-ldr x20, [sp, #8]
-ldr x21, [sp, #16]
-add sp, sp, #32
+ldp d8,  d9,  [sp, #48]
+ldp d10, d11, [sp, #32]
+ldp d12, d13, [sp, #16]
+ldp d14, d15, [sp], #(16 * 4)
 
 ret
 
diff --git a/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S b/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S
index 0e31ad489..0027d0b75 100644
--- a/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S
+++ b/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S
@@ -28,8 +28,11 @@ dup v31.4s, v31.s[0]     // v31: df
 fmov s30, #1.0     // v30: sf=1-df
 fsub s30, s30, s31
 movi v1.4s, #128   // s1=128
+scvtf v1.4s, v1.4s
 fmul s31, s31, s1
 fmul s30, s30, s1
+fcvtas v30.4s, v30.4s
+fcvtas v31.4s, v31.4s
 dup v31.8h, v31.h[0]
 dup v30.8h, v30.h[0]
 
diff --git a/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S b/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S
index 85cf65bbf..70227c97e 100644
--- a/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S
+++ b/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S
@@ -27,6 +27,7 @@ stp x19, x20, [sp, #(16 * 6)]
 mov w15, #8       // w15: pack
 uxtw x15, w15
 movi v14.4s, #128
+scvtf v14.4s, v14.4s
 
 cmp x5, #0
 beq END
@@ -44,6 +45,9 @@ fsub v23.4s, v23.4s, v22.4s   // v23: 1-factor
 fmul v23.4s, v23.4s, v14.s[0]
 fmul v22.4s, v22.4s, v14.s[0]
 
+fcvtas v22.4s, v22.4s
+fcvtas v23.4s, v23.4s
+
 dup v30.8b, v23.b[0]   // v30: sf0
 dup v31.8b, v22.b[0]   // v31: df0
 dup v28.8b, v23.b[4]  //  v28: sf1
@@ -183,7 +187,7 @@ beq END
 
 L1Loop:
 ld1 {v31.s}[0], [x3], #4
-dup v31.4s, v31.s[0]          
+dup v31.4s, v31.s[0]
 fmov s30, #1.0
 fsub s30, s30, s31
 fmul s30, s30, s14     // (float)t -> (int16)t
diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S
index eda852364..0225e0b4e 100644
--- a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S
+++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S
@@ -870,6 +870,7 @@ Tile8End:
     add x0, x0, x21, LSL #3
     add x1, x1, #64
     add x27, x27, #32
+    cbnz w23, TILE_4
     add x4, x4, #64 // Revert x4 for following tile.
 
 TILE_4:
diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp
index 897f10b40..d806e0cb9 100644
--- a/source/backend/cpu/compute/CommonOptFunction.cpp
+++ b/source/backend/cpu/compute/CommonOptFunction.cpp
@@ -3238,12 +3238,94 @@ static void _MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFunctions
     }
 }
 
+// fp32 <--> fp8
+static const int FP32_EXP_BIAS = 127;
+static const int FP8_EXP_BIAS = 24;   // [0, 31] --> [-24, 7] --> [1 / 2^24, 2^7]
+void MNNFp32ToFp8(uint8_t* dst, const float* src, size_t size) {
+    for (int i = 0; i < size; i++) {
+        uint32_t rawData = *((uint32_t *)(&src[i]));
+        uint32_t sign = (rawData >> 31) & 1U;
+        uint32_t exp = (int)((rawData >> 23) & 0x0ffU);
+        uint32_t mant = (rawData >> 21) & 3U;
+        int realExp = (int)exp - FP32_EXP_BIAS;
+        realExp = ALIMAX(realExp,  0 - FP8_EXP_BIAS);
+        realExp = ALIMIN(realExp, 31 - FP8_EXP_BIAS);
+        exp = (uint32_t)(realExp + FP8_EXP_BIAS);
+        dst[i] = (int8_t)((sign << 7) | (exp << 2) | mant);
+    }
+}
+void MNNFp8ToFp32(float* dst, const uint8_t* src, size_t size) {
+    for (int i = 0; i < size; i++) {
+        uint32_t sign = (src[i] >> 7) & 1U;
+        uint32_t exp = (int)((src[i] >> 2) & 0x1fU);
+        uint32_t mant = (src[i] & 3U) << 21;
+        int realExp = (int)exp - FP8_EXP_BIAS;
+        exp = (uint32_t)(realExp + FP32_EXP_BIAS);
+        uint32_t rawData = (sign << 31) | (exp << 23) | mant;
+        dst[i] = *((float *)(&rawData));
+    }
+}
+// fp16 <--> fp8
+void MNNFp16ToFp8(uint8_t* dst, const uint16_t* src, size_t size) {
+#ifdef MNN_USE_NEON
+#ifdef __aarch64__
+    int loopN = size / 16;
+    for (int i = 0; i < loopN; i++) {
+        uint8x16_t v1 = vld1q_u8((uint8_t*)(src + i * 16));
+        uint8x16_t v2 = vld1q_u8((uint8_t*)(src + i * 16 + 8));
+        uint8x16_t res = vuzp2q_u8(v1, v2);
+        vst1q_u8(dst + i * 16, res);
+    }
+    for (int i = loopN * 16; i < size; i++) {
+        dst[i] = static_cast<int8_t>(src[i] >> 8);
+    }
+#else
+    int loopN = size / 8;
+    for (int i = 0; i < loopN; i++) {
+        uint16x8_t vec = vld1q_u16(src + i * 8);
+        uint8x8_t  res = vshrn_n_u16(vec, 8);
+        vst1_u8(dst + i * 8, res);
+    }
+    for (int i = loopN * 8; i < size; i++) {
+        dst[i] = static_cast<int8_t>(src[i] >> 8);
+    }
+#endif // ARM64
+#else
+    for (int i = 0; i < size; i++) {
+        dst[i] = static_cast<int8_t>(src[i] >> 8);
+    }
+#endif // USE_NEON
+}
+void MNNFp8ToFp16(uint16_t* dst, const uint8_t* src, size_t size) {
+#ifdef MNN_USE_NEON
+    int loopN = size / 8;
+    for (int i = 0; i < loopN; i++) {
+        uint8x8_t vec8x8 = vld1_u8(src + i * 8);
+        uint16x8_t vec16x8 = vshll_n_u8(vec8x8, 8);
+        vst1q_u16(dst + i * 8, vec16x8);
+    }
+    for (int i = loopN * 8; i < size; i++) {
+        dst[i] = static_cast<int16_t>(src[i]) << 8;
+    }
+#else
+    for (int i = 0; i < size; i++) {
+        dst[i] = static_cast<int16_t>(src[i]) << 8;
+    }
+#endif // USE_NEON
+}
+
 namespace MNN {
 
 static CoreFunctions* gCoreFunction = nullptr;
 
 void MNNCoreFunctionInit() {
     gCoreFunction = new CoreFunctions;
+    // fp8
+    gCoreFunction->MNNFp32ToFp8 = MNNFp32ToFp8;
+    gCoreFunction->MNNFp16ToFp8 = MNNFp16ToFp8;
+    gCoreFunction->MNNFp8ToFp32 = MNNFp8ToFp32;
+    gCoreFunction->MNNFp8ToFp16 = MNNFp8ToFp16;
+    
     // MatMul
     gCoreFunction->MNNGetMatMulPackMode = MNNGetMatMulPackMode;
     gCoreFunction->MNNPackC4ForMatMul_A = MNNPackC4ForMatMul_A;
@@ -3426,4 +3508,4 @@ void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth
         areaOffset,
     };
     MNNPackInt8C2(dst, src, area, depth, offset);
-}
+}
\ No newline at end of file
diff --git a/source/backend/cpu/compute/CommonOptFunction.h b/source/backend/cpu/compute/CommonOptFunction.h
index bbfdce0fa..4af1a81a8 100644
--- a/source/backend/cpu/compute/CommonOptFunction.h
+++ b/source/backend/cpu/compute/CommonOptFunction.h
@@ -20,6 +20,11 @@
 
 extern "C" {
 
+void MNNFp32ToFp8(uint8_t* dst, const float* src, size_t size);
+void MNNFp8ToFp32(float* dst, const uint8_t* src, size_t size);
+void MNNFp16ToFp8(uint8_t* dst, const uint16_t* src, size_t size);
+void MNNFp8ToFp16(uint16_t* dst, const uint8_t* src, size_t size);
+
 void MNNReluWithSlope(float* dst, const float* src, size_t sizeQuad, float slope);
 
 void MNNReluInt8(int8_t* dst, const int8_t* src, size_t size, ssize_t zeroPoint);
@@ -190,6 +195,12 @@ constexpr int InputTileMax = 14; // same value from DynamicGemm.h, cannot includ
 
 namespace MNN {
 struct CoreFunctions {
+    // fp8
+    void (*MNNFp32ToFp8)(uint8_t* dst, const float* src, size_t size);
+    void (*MNNFp16ToFp8)(uint8_t* dst, const uint16_t* src, size_t size);
+    void (*MNNFp8ToFp32)(float* dst, const uint8_t* src, size_t size);
+    void (*MNNFp8ToFp16)(uint16_t* dst, const uint8_t* src, size_t size);
+
     // cpu feature
     bool supportFp16arith = false;
     bool supportSDot = false;
diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp
index 6471acb3a..25fb13a8f 100644
--- a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp
+++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp
@@ -18,8 +18,11 @@
 
 namespace MNN {
 
-ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr<ResourceInt8> res): CPUConvolution(convOp->common(), backend), mResourceInt8(res), mMutableResource(res, backend) {
-    mValid = mMutableResource.mValid;
+ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Op* op): CPUConvolution(op->main_as_Convolution2D()->common(), backend) {}
+
+ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ResourceInt8> res): CPUConvolution(op->main_as_Convolution2D()->common(), backend), mResourceInt8(res) {
+    mMutableResource.reset(new MutableResourceInt8(res, backend));
+    mValid = mMutableResource->mValid;
 }
 
 ConvInt8TiledExecutor::~ConvInt8TiledExecutor() {
@@ -31,7 +34,7 @@ bool ConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst)
 }
 
 ErrorCode ConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
-    mMutableResource.updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0]));
+    mMutableResource->updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0]));
     CPUConvolution::onResize(inputs, outputs);
     ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, static_cast<CPUBackend*>(backend())->functions(), static_cast<CPUBackend*>(backend())->int8Functions());
     return NO_ERROR;
@@ -99,7 +102,7 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
         pack = 4;
     }
     if (SRC_UNIT > pack) {
-        MNN_ASSERT(SRC_UNIT % UNIT == 0);
+        MNN_ASSERT(SRC_UNIT % pack == 0);
         shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack), UNIT, SRC_UNIT};
     } else {
         shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
@@ -116,13 +119,13 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
     return true;
 }
 
-static void Getfp32Info (std::shared_ptr<CPUConvolution::Resource> resource, std::shared_ptr<Tensor> weightOrigin, const Convolution2D* conv2d, std::shared_ptr<ConvolutionCommon::Int8Common> quantCommon) {
+static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resource, std::shared_ptr<ConvolutionCommon::Int8Common> quantCommon, const Convolution2D* conv2d, Backend* backend) {
     // common parameters
     int outputCount = conv2d->common()->outputCount();
-    auto core = static_cast<CPUBackend*>(resource->backend)->functions();
+    auto core = static_cast<CPUBackend*>(backend)->functions();
     int LSize = conv2d->common()->inputCount() * conv2d->common()->kernelX() * conv2d->common()->kernelY();
     int ocUp4 = ROUND_UP(outputCount, core->pack);
-    
+
     int dequantCnt = quantCommon->alpha.size();
     if (quantCommon->asymmetric) {
         dequantCnt /= 2;
@@ -131,19 +134,33 @@ static void Getfp32Info (std::shared_ptr<CPUConvolution::Resource> resource, std
     int scaleSize = blockNum * ocUp4; // pack size.
     int blockSize = LSize / blockNum;
     int originOffset = 0;
+    resource->mActBits = 8;
     if (quantCommon->canUseInt4) {
         originOffset = -8;
+        resource->mActBits = 4;
     }
-
-    // Save weight quant scale and bias: wf=scale*wi+bias
+    // Save bias
+    resource->mOriginBias.reset(Tensor::createDevice<int32_t>({ocUp4})); // float
+    auto success = backend->onAcquireBuffer(resource->mOriginBias.get(), Backend::STATIC);
+    if (!success) {
+        MNN_ERROR("Alloc bias memory error\n");
+        return;
+    }
+    ::memset(resource->mOriginBias->host<float>(), 0, ocUp4 * sizeof(float));
+    if (conv2d->bias()) {
+        ::memcpy(resource->mOriginBias->host<float>(), conv2d->bias()->data(), outputCount * sizeof(float));
+    } else {
+        ::memset(resource->mOriginBias->host<float>(), 0, ocUp4 * sizeof(float));
+    }
+    // Save weight quant alpha and zero: wf=alpha*wi+zero
     int bytes = 4;
-    resource->mDequantize.mScaleBias.reset(Tensor::createDevice<uint8_t>({2 * scaleSize * bytes}));
-    auto success = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC);
+    resource->mOriginScale.reset(Tensor::createDevice<uint8_t>({2 * scaleSize * bytes}));
+    success = backend->onAcquireBuffer(resource->mOriginScale.get(), Backend::STATIC);
     if (!success) {
-        MNN_ERROR("Alloc denquant scaleBias memory error\n");
+        MNN_ERROR("Alloc denquant alpha, zero memory error\n");
         return;
     }
-    auto alphaPtr = resource->mDequantize.mScaleBias->host<float>();
+    auto alphaPtr = resource->mOriginScale->host<float>();
     auto biasPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(alphaPtr) + scaleSize * bytes);
     ::memset(alphaPtr, 1, scaleSize * bytes);
     ::memset(biasPtr, 0, scaleSize * bytes);
@@ -159,7 +176,6 @@ static void Getfp32Info (std::shared_ptr<CPUConvolution::Resource> resource, std
                 dstBias[j] = quanInfoPtr[2 * scaleIndex] + (float)originOffset * dstAlpha[j];
             }
         }
-
     } else {
         for (int i = 0; i < blockNum; ++i) {
             auto dstAlpha = alphaPtr + i * ocUp4;
@@ -173,13 +189,13 @@ static void Getfp32Info (std::shared_ptr<CPUConvolution::Resource> resource, std
     }
     // Save float weight kernel sum
     resource->mWeightKernelSum.reset(Tensor::createDevice<uint8_t>({bytes * ocUp4}));
-    success = resource->backend->onAcquireBuffer(resource->mWeightKernelSum.get(), Backend::STATIC);
+    success = backend->onAcquireBuffer(resource->mWeightKernelSum.get(), Backend::STATIC);
     if (!success) {
-        MNN_ERROR("Alloc denquant mWeightKernelSum memory error\n");
+        MNN_ERROR("Alloc denquant weight kernel sum memory error\n");
         return;
     }
     auto weightKernelSum = resource->mWeightKernelSum->host<float>();
-    auto realWeightData = weightOrigin->host<int8_t>();
+    auto realWeightData = quantCommon->weight.get();
     ::memset(weightKernelSum, 0, resource->mWeightKernelSum->size());
     for (int j = 0; j < outputCount; ++j) {
         float sum = 0.f;
@@ -195,116 +211,193 @@ static void Getfp32Info (std::shared_ptr<CPUConvolution::Resource> resource, std
                 bias = 0;
             }
             int tmp = 0;
-            for (int i = 0; i < blockSize; ++i) {
-                int l_index = k * blockSize + i;
-                tmp += (int)realWeightData[j * blockNum * blockSize + l_index];
+            if (quantCommon->canUseInt4) {
+                for (int i = 0; i < blockSize; ++i) {
+                    int l_index = k * blockSize + i;
+                    int w_idx = (j * blockNum * blockSize + l_index);
+                    int w_offset = w_idx / 2;
+                    int w_mask = w_idx % 2;
+                    uint8_t s = realWeightData[w_offset];
+                    int val = w_idx % 2 ? s & 0x0f : s >> 4;
+                    tmp += (val - 8);
+                }
+            } else {
+                for (int i = 0; i < blockSize; ++i) {
+                    int l_index = k * blockSize + i;
+                    tmp += (int)realWeightData[j * blockNum * blockSize + l_index];
+                }
             }
+
             sum += (tmp * scale + blockSize * bias);
         }
         weightKernelSum[j] = sum;
     }
 }
 
-DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr<ResourceInt8> res, bool dynamicQuantExe) : ConvInt8TiledExecutor(backend, convOp, res) {
-    std::shared_ptr<Tensor> weightOrigin = mResourceInt8->mWeightInt8;
-    std::shared_ptr<MNN::ConvolutionCommon::Int8Common> quanCommon ;
-    mDynamicQuantExe = dynamicQuantExe;
-    if (dynamicQuantExe) {
-        MNN_ASSERT(convOp->quanParameter() != nullptr && convOp->quanParameter()->buffer() != nullptr);
-        quanCommon = ConvolutionCommon::load(convOp, backend, false, true);
-        // fp32 weightKernelSum
-        mResource.reset(new CPUConvolution::Resource);
-        mResource->backend = backend;
-        Getfp32Info(mResource, weightOrigin, convOp, quanCommon); // Call this before reorder weight.
-    }
-    
-    mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8);
-    if(!mValid) {
-        return;
-    }
+DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon) : mDynamicQuantExe(true), ConvInt8TiledExecutor(backend, op) {
+    auto convOp = op->main_as_Convolution2D();
     auto core = static_cast<CPUBackend*>(backend)->int8Functions();
     auto gcore = static_cast<CPUBackend*>(backend)->functions();
-    // offline quant
-    if (false == dynamicQuantExe) {
-        mGemmKernel = core->Int8GemmKernel;
-#ifdef MNN_USE_SSE
-        int actBits = convOp->symmetricQuan()->nbits();
-        if (actBits <= 7) {
-            mGemmKernel = core->Int8GemmKernelFast;
-        }
-#else
-        if(convOp->symmetricQuan()->method() == QuantizeAlgo_OVERFLOW_AWARE){
-            mGemmKernel = core->Int8GemmKernelFast;
-        }
-#endif
-        mResource.reset(new CPUConvolution::Resource);
-        CPUConvolution::makeResource(backend, mResource, convOp, mResourceInt8);
-        return;
-    }
-
+    mResourceInt8.reset(new CPUConvolution::ResourceInt8);
+    GetResourceInt8(mResourceInt8, quanCommon, convOp, backend);
+    mMutableResource.reset(new MutableResourceInt8(mResourceInt8, backend));
     // dynamic quant
     int UNIT, SRC_UNIT, DST_XUNIT;
     core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
-    bool needPermuteInt4weight = ((UNIT == 8 && SRC_UNIT == 8 && DST_XUNIT ==10) || (UNIT == 4 && SRC_UNIT == 8 && DST_XUNIT ==20) || (UNIT == 64 && SRC_UNIT == 4 && DST_XUNIT ==4));
-    mResource->mDequantize.bits = 8;
-    if (quanCommon->canUseInt4) {
+    int pack = gcore->pack;
+    bool needPermuteInt4weight = ((UNIT == 8 && SRC_UNIT == 8 && DST_XUNIT ==10) || (UNIT == 64 && SRC_UNIT == 4 && DST_XUNIT ==4));
+    auto weightLength = quanCommon->weight.size();
+    int kernelCount = mCommon->kernelX() * mCommon->kernelY();
+    int oc = convOp->common()->outputCount();
+    int ic = convOp->common()->inputCount();
+    bool directReadInt4weight = (kernelCount == 1 && ROUND_UP(oc, UNIT) == oc && ROUND_UP(ic, SRC_UNIT) == ic);
+    if (quanCommon->canUseInt4 && directReadInt4weight) {
+        // int4 weight reorder
         mResourceInt8->mWeightAsymmetricQuant = true;
-        auto weightLength = mResourceInt8->mWeightInt8->size();
-        MNN_ASSERT(weightLength % 2 == 0);
-        mResource->mDequantize.bits = 4;
-        std::shared_ptr<MNN::Tensor> weightLow(Tensor::createDevice<uint8_t>( mResourceInt8->mWeightInt8->shape()));
-        auto res = mResource->backend->onAcquireBuffer(weightLow.get(), Backend::STATIC);
+        // shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
+        int hU = UP_DIV(oc, UNIT);
+        int lU = UP_DIV(ic, SRC_UNIT);
+        int hP = UNIT;
+        int lP = SRC_UNIT;
+        
+        // weight shape.
+        std::vector<int32_t> shape;
+        if (SRC_UNIT > pack) {
+            MNN_ASSERT(SRC_UNIT % pack == 0);
+            shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack), UNIT, SRC_UNIT};
+        } else {
+            shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
+        }
+        mResourceInt8->mWeightInt8.reset(Tensor::createDevice<uint8_t>(shape));
+
+        auto res = backend->onAcquireBuffer(mResourceInt8->mWeightInt8.get(), Backend::STATIC);
         if (!res) {
             MNN_ERROR("int4 weight acquire buffer error\n");
             return ;
         }
-        auto srcPtr = mResourceInt8->mWeightInt8->host<int8_t>();
-        auto dstPtr = weightLow->host<uint8_t>();
+        auto srcPtr = (uint8_t*)quanCommon->weight.get();
+        auto dstPtr = mResourceInt8->mWeightInt8->host<uint8_t>();
+        ::memset(dstPtr, 0, mResourceInt8->mWeightInt8->size());
+
         // Pack two int4-weight to one int8-weight.
         if (false == needPermuteInt4weight) {
-            weightLength = UP_DIV(weightLength, 2);
-            for (int i=0; i < weightLength; ++i) {
-                int s0 = srcPtr[2 * i + 0];
-                int s1 = srcPtr[2 * i + 1];
-                int d = (s0 + 8) * 16 + (s1 + 8);
-                dstPtr[i] = d;
+            for (int i = 0; i < hU; i++) {
+                for (int j = 0; j < lU; j++) {
+                    for (int k = 0; k < hP; k++) {
+                        for (int id = 0; id < lP / 2; ++id) {
+                            dstPtr[(i * lU * lP * hP + j * hP * lP + k * lP) / 2 + id] = srcPtr[((i * hP + k) * lP * lU + (j * lP)) / 2 + id];
+                        }
+                    }
+                }
             }
         } else {
-            int permuteUnit = UNIT * SRC_UNIT;
-            int halfPermuteStride = static_cast<int32_t>(permuteUnit / 2);
-            for (int i = 0; i < weightLength / permuteUnit; ++i) {
-                auto src0 = srcPtr + i * permuteUnit;
-                auto dst0 = dstPtr + i * halfPermuteStride;
-                for (int j = 0; j < halfPermuteStride; ++j) {
-                    int s0 = src0[j];
-                    int s1 = src0[j + halfPermuteStride];
+            for (int i = 0; i < hU; i++) {
+                for (int j = 0; j < lU; j++) {
+                    auto dst_ptr = dstPtr + (i * lU * lP * hP + j * hP * lP) / 2;
+                    for (int k = 0; k < 16; k++) {
+                        int col = k % 4;
+                        int row = k / 4;
+                        uint8_t s0 = srcPtr[((i * hP + row + 0) * lP * lU + j * lP) / 2 + col];
+                        uint8_t s1 = srcPtr[((i * hP + row + 4) * lP * lU + j * lP) / 2 + col];
+                        uint8_t d0 = (s0 & 0xf0) | (s1 >> 4);
+                        uint8_t d1 = (s0 << 4) | (s1 & 0x0f);
+                        dst_ptr[k * 2 + 0] = d0;
+                        dst_ptr[k * 2 + 1] = d1;
+                    }
+               }
+            }
+        }
+    } else {
+        // std::shared_ptr<Tensor> srcWeight;
+        
+        if (quanCommon->canUseInt4) {
+            mResourceInt8->mWeightAsymmetricQuant = true;
+            auto srcPtr = reinterpret_cast<uint8_t*>(quanCommon->weight.get());
+            std::vector<int8_t> tmpWeight(weightLength * 2, 0);
+            for (int i = 0; i < weightLength; ++i) {
+                int8_t s0 = (srcPtr[i] >> 4) - 8; // For int4 quant weight, +8 saved in quant buffer
+                int8_t s1 = (srcPtr[i] & 0x0f) - 8;
+                tmpWeight[2 * i + 0] = s0;
+                tmpWeight[2 * i + 1] = s1;
+            }
+            std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength * 2}, (void*)tmpWeight.data()));
+            mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8);
+            if(!mValid) {
+                return;
+            }
+            MNN_ASSERT(mResourceInt8->mWeightInt8->size() % 2 == 0);
+            int leng = mResourceInt8->mWeightInt8->size();
+            int halflen = leng / 2;
+            std::shared_ptr<Tensor> weightLow(Tensor::create<uint8_t>({halflen}));
+            auto dstint4Ptr = weightLow->host<uint8_t>();
+            auto srcint4Ptr = mResourceInt8->mWeightInt8->host<int8_t>();
+            if (false == needPermuteInt4weight) {
+                for (int i=0; i < halflen; ++i) {
+                    int s0 = srcint4Ptr[2 * i + 0];
+                    int s1 = srcint4Ptr[2 * i + 1];
                     int d = (s0 + 8) * 16 + (s1 + 8);
-                    dst0[j] = d;
+                    dstint4Ptr[i] = d;
                 }
+            } else {
+                int permuteUnit = UNIT * SRC_UNIT;
+                int halfPermuteStride = static_cast<int32_t>(permuteUnit / 2);
+                for (int i = 0; i < leng / permuteUnit; ++i) {
+                    auto src0 = srcint4Ptr + i * permuteUnit;
+                    auto dst0 = dstint4Ptr + i * halfPermuteStride;
+                    for (int j = 0; j < halfPermuteStride; ++j) {
+                        int s0 = src0[j];
+                        int s1 = src0[j + halfPermuteStride];
+                        int d = (s0 + 8) * 16 + (s1 + 8);
+                        dst0[j] = d;
+                    }
+                }
+            }
+            // Update int4 weight to mWeightInt8.
+            mResourceInt8->mWeightInt8 = weightLow;
+        } else {
+            std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength}, (void*)quanCommon->weight.get()));
+            mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8);
+            if(!mValid) {
+                return;
             }
         }
-        // Update int4 weight to mWeightInt8.
-        mResourceInt8->mWeightInt8 = weightLow;
     }
+
     // Relu/Relu6 post parameters
     auto postPtr = getPostParameters();
-    mResource->mReluThreshold.resize(2);
-    mResource->mReluThreshold[0] = postPtr[2];
-    mResource->mReluThreshold[1] = postPtr[3];
+    mResourceInt8->mReluThreshold.resize(2);
+    mResourceInt8->mReluThreshold[0] = postPtr[2];
+    mResourceInt8->mReluThreshold[1] = postPtr[3];
     if (gcore->bytes == 2) {
-        gcore->MNNFp32ToLowp(mResource->mReluThreshold.data(), reinterpret_cast<int16_t*>(mResource->mReluThreshold.data()), 2);
+        gcore->MNNFp32ToLowp(mResourceInt8->mReluThreshold.data(), reinterpret_cast<int16_t*>(mResourceInt8->mReluThreshold.data()), 2);
     }
-    if (mCommon->relu()) {
-        mResource->mReluThreshold[0] = 0.f;
+}
+
+DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ResourceInt8> res) : mDynamicQuantExe(false), ConvInt8TiledExecutor(backend, op, res) {
+    std::shared_ptr<Tensor> weightOrigin = mResourceInt8->mWeightInt8;
+    auto convOp = op->main_as_Convolution2D();
+    mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8);
+    if(!mValid) {
+        return;
     }
-    if (mCommon->relu6()) {
-        mResource->mReluThreshold[0] = 0.f;
-        mResource->mReluThreshold[1] = 6.f;
+    // offline quant: choose int8 gemm kernel
+    auto core = static_cast<CPUBackend*>(backend)->int8Functions();
+    mGemmKernel = core->Int8GemmKernel;
+#ifdef MNN_USE_SSE
+    int actBits = convOp->symmetricQuan()->nbits();
+    if (actBits <= 7) {
+        mGemmKernel = core->Int8GemmKernelFast;
     }
+#else
+    if(convOp->symmetricQuan()->method() == QuantizeAlgo_OVERFLOW_AWARE){
+        mGemmKernel = core->Int8GemmKernelFast;
+    }
+#endif
+    CPUConvolution::makeResourceNew(backend, convOp, mResourceInt8);
 }
 
-DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, bool dynamicQuantExe, const DenseConvInt8TiledExecutor& exe)
-    : ConvInt8TiledExecutor(backend, convOp, exe.mResourceInt8), mGemmKernel(exe.mGemmKernel), mResource(exe.mResource), mDynamicQuantExe(dynamicQuantExe) {
+DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, const DenseConvInt8TiledExecutor& exe)
+    : ConvInt8TiledExecutor(backend, op, exe.mResourceInt8), mGemmKernel(exe.mGemmKernel), mDynamicQuantExe(exe.mDynamicQuantExe) {
 }
 
 DenseConvInt8TiledExecutor::~DenseConvInt8TiledExecutor() {
@@ -315,7 +408,7 @@ bool DenseConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution**
     if (nullptr == dst) {
         return true;
     }
-    auto exe = new DenseConvInt8TiledExecutor(bn, op->main_as_Convolution2D(), mDynamicQuantExe, *this);
+    auto exe = new DenseConvInt8TiledExecutor(bn, op, *this);
     if (!exe->valid()) {
         return false;
     }
@@ -342,7 +435,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
     core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
 
     if (mDynamicQuantExe == false) {
-        mMutableResource.updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0]));
+        mMutableResource->updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0]));
         CPUConvolution::onResize(inputs, outputs);
         ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, gcore, core);
         mBlockNum = 1;
@@ -350,18 +443,18 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
         CPUConvolution::onResize(inputs, outputs);
         // Gemm Kernel
         mGemmKernel = core->Int8GemmKernel;
-        if (mResource->mDequantize.bits == 4) {
+        if (mResourceInt8->mActBits == 4) {
             mGemmKernel = core->Int8GemmKernel_W4;
         }
         mQuantFunc = core->MNNFloat2Int8;
         if (gcore->bytes == 2 && gcore->pack == 8) {
             mGemmKernel = core->MNNGemmInt8AddBiasScale_Unit_FP16;
-            if (mResource->mDequantize.bits == 4) {
+            if (mResourceInt8->mActBits == 4) {
                 mGemmKernel = core->MNNGemmInt8AddBiasScale_w4_Unit_FP16;
             }
             mQuantFunc = core->DynamicQuanInput_ARM82;
             mQuantAndReorderFunc = core->DynamicQuanInputAndReorder_ARM82;
-            
+
         }
         // A axisSum kernel
         mSumByAxisLFunc = gcore->MNNSumByAxisLForMatmul_A;
@@ -371,10 +464,10 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
             ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, gcore, core);
         }
         int ocUp4 = ROUND_UP(outputs[0]->channel(), gcore->pack);
-        int alphaSize = mResource->mDequantize.mScaleBias->size() / (4 * 2);
+        int alphaSize = mResourceInt8->mOriginScale->size() / (sizeof(float) * 2);
         mBlockNum  = alphaSize / ocUp4;
     }
-    
+
     // input scale buffer
     int batch = inputs[0]->batch();
 //    mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT * mIm2ColCount * mResourceInt8->mWeightInt8->length(1) * SRC_UNIT}));
@@ -398,23 +491,30 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
         tileLimit = ALIMIN(tileLimitByC, planeSize);
         auto ocPerThread = UP_DIV(outC4, threads);
         auto threadNeed = UP_DIV(outC4, ocPerThread);
+        int totalWork = outC4;
+        int part = 1;
         if (UNIT > gcore->pack) { // AVX512:UNIT=64,pack=16
             MNN_ASSERT(UNIT % gcore->pack == 0);
             int ocDivUnit = UP_DIV(outC4 * gcore->pack, UNIT);
             ocPerThread = UP_DIV(ocDivUnit, threads);
             threadNeed  = UP_DIV(ocDivUnit, ocPerThread);
+            totalWork = ocDivUnit;
+            part = UNIT / gcore->pack;
         }
         mThreadNums = ALIMIN(threads, threadNeed);
         mSplitByOc = true;
 
         mDivides.resize(threads+1);
         mDivides[0] = 0;
-        static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(outC4, mDivides.data() + 1);
+        static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(totalWork, mDivides.data() + 1);
+        for (int i = 0; i < mDivides.size(); ++i) {
+            mDivides[i] *= part;
+        }
     }
     mIm2ColCount = UP_DIV(tileLimit, DST_XUNIT);
     auto DynamicDestUnit = DST_XUNIT * mIm2ColCount;
     mTileCount        = UP_DIV(planeSize, DynamicDestUnit);
-    
+
     if (threads < planeSize) {
         mThreadNums = ALIMIN(threads, mTileCount);
         mDivides.resize(threads+1);
@@ -422,14 +522,16 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
         static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(mTileCount, mDivides.data() + 1);
     }
     int ocUp4 = ROUND_UP(outC, gcore->pack);
-    int alphaSize = mResource->mDequantize.mScaleBias->size() / (4 * 2);
+    // int alphaSize = mResource->mDequantize.mScaleBias->size() / (sizeof(float) * 2);
+    int alphaSize = mResourceInt8->mOriginScale->size() / (sizeof(float) * 2);
 
     auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
     auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT * mIm2ColCount, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
     mBlitInfoStride = blitInfoSize.second;
     mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
-    mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT * mIm2ColCount * mResourceInt8->mWeightInt8->length(1) * SRC_UNIT}));
-    mTempSrcSum.resize(mThreadNums * mBlockNum * DST_XUNIT * mIm2ColCount * 4); // Use 4 bytes to save kernel sum.
+    auto icDiv4KernelCount = mIm2ColParamter.kernelCountUnit;
+    mTempIm2ColBuffer.reset(Tensor::createDevice<int8_t>({threads, DST_XUNIT * mIm2ColCount * icDiv4KernelCount * SRC_UNIT}));
+    mTempSrcSum.resize(threads * mBlockNum * DST_XUNIT * mIm2ColCount * 4); // Use 4 bytes to save kernel sum.
 
     success &= backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC);
     if (!success || mBlitInfo.invalid()) {
@@ -442,7 +544,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
         return NO_ERROR;
     }
 
-    
+
     int inC   = inputs[0]->channel();
     // set im2col tensor info
     mQuantInput.reset((Tensor::createDevice<int8_t>({batch, mIm2ColParamter.ih, mIm2ColParamter.iw, ROUND_UP(inC, gcore->pack)})));
@@ -451,12 +553,12 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
     // set compute buffer
     mDynamicBias.reset(Tensor::createDevice<uint8_t>({ocUp4 * 4}));
     mScaleFuse.reset(Tensor::createDevice<uint8_t>({alphaSize * 4}));
-    
+
     success &= backend()->onAcquireBuffer(mQuantInput.get(), Backend::DYNAMIC);
     success &= backend()->onAcquireBuffer(mDynamicBias.get(), Backend::DYNAMIC);
     success &= backend()->onAcquireBuffer(mTempMaxMinValueBuffer.get(), Backend::DYNAMIC);
     success &= backend()->onAcquireBuffer(mScaleFuse.get(), Backend::DYNAMIC);
-    
+
     if (mUseBatchQuan) {
         int infobytes = 4; // use float32 to save dequant scale and quant scale.
         int size = mThreadNums * batch * gcore->bytes + 2 * batch * infobytes;
@@ -502,7 +604,8 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
     const auto col_buffer_unit_size  = kernelCountUnitDouble * DST_XUNIT * SRC_UNIT * sizeof(int8_t);
     const auto col_buffer_size       = col_buffer_unit_size * mIm2ColCount;
     const int dstBytes               = static_cast<CPUBackend*>(backend())->getBytes(backend(), output);
-    const int alphaSize              = mResource->mDequantize.mScaleBias->size() / (4 * 2);
+    // const int alphaSize              = mResource->mDequantize.mScaleBias->size() / (4 * 2);
+    const int alphaSize              = mResourceInt8->mOriginScale->size() / (sizeof(float) * 2);
     const int blockL                  = kernelCountUnitDouble / mBlockNum; // source depthQuad for each block.
     float weightBytes                = 1.f;
     int weight_step_Y                = weightBytes * (UNIT__ * SRC_UNIT);
@@ -512,15 +615,15 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
     auto im2colPtr           = mTempIm2ColBuffer->host<int8_t>();
     const auto weightDataPtr = mResourceInt8->mWeightInt8->host<int8_t>();
     auto srcKernelSumPtr     = mTempSrcSum.data();
-    auto weightDequantBias = mResource->mDequantize.mScaleBias->host<uint8_t>() + alphaSize * 4;
+    auto weightDequantBias = mResourceInt8->mOriginScale->host<uint8_t>() + alphaSize * 4;
 
     auto outputDataPtr = output->host<int8_t>();
-    auto biasPtr       = mMutableResource.mBiasFloat->host<uint8_t>();
-    auto scalePtr      = mMutableResource.mScaleFloat->host<uint8_t>();
+    auto biasPtr       = mMutableResource->mBiasFloat->host<uint8_t>();
+    auto scalePtr      = mMutableResource->mScaleFloat->host<uint8_t>();
 
-    auto inputZeroPoint  = mMutableResource.mInputZeroPoint;
+    auto inputZeroPoint  = mMutableResource->mInputZeroPoint;
     auto inputScalePtr = mInputDeqScales->host<uint8_t>();
-    (reinterpret_cast<float*>(inputScalePtr))[0]     =  mMutableResource.mInputScale;
+    (reinterpret_cast<float*>(inputScalePtr))[0]     =  mMutableResource->mInputScale;
 
     auto SingleDynamicQuant = [&] () {
         const auto floatptr = input->host<float>();
@@ -583,16 +686,16 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
         }
 
         /* bias float */
-        #ifdef MNN_USE_SSE
+    #ifdef MNN_USE_SSE
         int offset = 128;
     #else
         int offset = 0;
     #endif
-        auto biasfp32 = mMutableResource.mResource->mOriginBias->host<float>();
-        auto weightDequantScale = mResource->mDequantize.mScaleBias->host<float>();
+        auto biasfp32 = mMutableResource->mResource->mOriginBias->host<float>();
+        auto weightDequantScale = mResourceInt8->mOriginScale->host<float>();
         float zerofp32 = (zeropoint + offset) * dequantscale;
 
-        gcore->MNNDynamicUpdateConvBiasScale(mDynamicBias->host<float>(), mScaleFuse->host<float>(), biasfp32, weightDequantScale, &dequantscale, mResource->mWeightKernelSum->host<float>(), &zerofp32, UP_DIV(output->channel(), 4), alphaSize);
+        gcore->MNNDynamicUpdateConvBiasScale(mDynamicBias->host<float>(), mScaleFuse->host<float>(), biasfp32, weightDequantScale, &dequantscale, mResourceInt8->mWeightKernelSum->host<float>(), &zerofp32, UP_DIV(output->channel(), 4), alphaSize);
         // Move step for A and B for each block computing
 
         inputZeroPoint = zeropoint;
@@ -643,12 +746,12 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
             gcore->MNNDynamicQuant(inputData, int8ptr, scale_ptr, workCount, batch, PackUnit);
         }
         MNN_CONCURRENCY_END();
-        
+
         inputZeroPoint = 0;
         inputScalePtr     = (uint8_t*)dequantPtr;
         inputDataPtr = mQuantInput->host<int8_t>();
-        biasPtr = mMutableResource.mResource->mOriginBias->host<uint8_t>();
-        scalePtr = mResource->mDequantize.mScaleBias->host<uint8_t>();
+        biasPtr = mMutableResource->mResource->mOriginBias->host<uint8_t>();
+        scalePtr = mResourceInt8->mOriginScale->host<uint8_t>();
     };
     ssize_t oneScale = 1;
     if (mUseBatchQuan) {
@@ -659,12 +762,13 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
     } else {
         // offline quant.
     }
-    
-    if (mResource->mDequantize.bits == 4) {
+
+
+    if (mResourceInt8->mActBits == 4) {
         weightBytes   = 0.5;
         weight_step_Y *= 0.5;
     }
-    
+
     SumByAxisParams sumParams;
     sumParams.oneScale = oneScale;
     sumParams.SRC_UNIT = SRC_UNIT;
@@ -672,13 +776,13 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
     sumParams.DST_XUNIT = DST_XUNIT;
     sumParams.col_buffer_unit_size = col_buffer_unit_size;
     sumParams.kernelCountUnitDouble = kernelCountUnitDouble;
-    
+
     auto ThreadFunction = [&](int tId, int eStartIndex, int eEndIndex, int estep, int ocIndex) {
         auto ocDivThread = ocDiv4;
         if (mSplitByOc) { // Thread split by OC
             ocDivThread = ALIMIN(mDivides[tId + 1] - mDivides[tId], ocDiv4 - mDivides[tId]);
         }
-        float* reluPtr = mResource->mReluThreshold.data();
+        float* reluPtr = mResourceInt8->mReluThreshold.data();
         uint8_t* extraScale = nullptr; // input scale for batch dynamic quant.
         QuanPostTreatParameters quanParam;
         quanParam.blockNum = mBlockNum;
@@ -686,17 +790,17 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
             extraScale = inputScalePtr;
         }
 #ifdef MNN_USE_SSE
-        quanParam.extraBias = mResource->mWeightKernelSum->host<float>() + ocIndex;
+        quanParam.extraBias = mResourceInt8->mWeightKernelSum->host<float>() + ocIndex;
 #endif
         if (dstBytes != 1) {
             quanParam.useInt8 = 0;
             quanParam.fp32minmax = reluPtr;
         } else {
-            quanParam.maxValue = mMutableResource.mClampMax;
+            quanParam.maxValue = mMutableResource->mClampMax;
             if (mResourceInt8->mRelu) {
-                quanParam.minValue = mMutableResource.mOutputZeroPoint;
+                quanParam.minValue = mMutableResource->mOutputZeroPoint;
             } else {
-                quanParam.minValue = mMutableResource.mClampMin;
+                quanParam.minValue = mMutableResource->mClampMin;
             }
         }
         auto outputTid = outputDataPtr + ocIndex * plane * dstBytes;
@@ -752,6 +856,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
                     int step = ALIMIN(DST_XUNIT, realDstCount);
                     quanParam.srcKernelSum = ptrX;
                     quanParam.extraScale = extraScale != nullptr ? (float*)ptrExtraScale : nullptr;
+                    // printf("step=%d, ocDivThread=%d\n", step, ocDivThread);
                     mGemmKernel(outputInTilePtr, colAddrTemp, weightPtrTid, kernelCountUnitDouble, dstZStep * dstBytes, ocDivThread, &quanParam, step);
                     ptrX += step;
                     realDstCount-=step;
@@ -787,20 +892,21 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
             }
         }
     };
-
+    const int threads = static_cast<CPUBackend*>(backend())->threadNumber();
     if (!mSplitByOc) {
-        MNN_CONCURRENCY_BEGIN(tId, mThreadNums) {
+        MNN_CONCURRENCY_BEGIN(tId, threads) {
             ThreadFunction((int)tId, mDivides[tId], mDivides[tId + 1], 1, 0);
         }
         MNN_CONCURRENCY_END();
     } else {
-        MNN_CONCURRENCY_BEGIN(tId, mThreadNums) {
+        MNN_CONCURRENCY_BEGIN(tId, threads) {
             int ocIndex = PackUnit * mDivides[tId];
-            ThreadFunction((int)tId, 0, mTileCount,1, ocIndex);
+            if (ocIndex < ocUp4) {
+                ThreadFunction((int)tId, 0, mTileCount,1, ocIndex);
+            }
         }
         MNN_CONCURRENCY_END();
     }
-
     return NO_ERROR;
 }
 
diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp
index d4524837c..c5fc5f4d3 100644
--- a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp
+++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp
@@ -18,7 +18,8 @@ namespace MNN {
 class ConvInt8TiledExecutor : public CPUConvolution {
 public:
     // given weight+bias+scale, do post process
-    ConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr<ResourceInt8> res);
+    ConvInt8TiledExecutor(Backend* backend, const Op* op);
+    ConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ResourceInt8> res);
     virtual ~ConvInt8TiledExecutor();
     virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
     virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
@@ -31,8 +32,7 @@ class ConvInt8TiledExecutor : public CPUConvolution {
     int mThreadNums;
     std::shared_ptr<Tensor> mTempIm2ColBuffer;
     std::shared_ptr<CPUConvolution::ResourceInt8> mResourceInt8;
-    // std::shared_ptr<CPUConvolution::Resource> mResource;
-    CPUConvolution::MutableResourceInt8 mMutableResource;
+    std::shared_ptr<CPUConvolution::MutableResourceInt8> mMutableResource;
     MemChunk mBlitInfo;
     std::pair<size_t, size_t> mBlitInfoStride;
     int mIm2ColCount;
@@ -50,14 +50,15 @@ class ConvInt8TiledExecutor : public CPUConvolution {
 class DenseConvInt8TiledExecutor : public ConvInt8TiledExecutor {
 public:
     // given weight+bias+scale, do post process
-    DenseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr<ResourceInt8> res, bool dynamicQuantExe);
+    DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ResourceInt8> res); // ptq
+    DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon); // dynamic quant
     virtual ~DenseConvInt8TiledExecutor();
     virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
     virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
     virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
     void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) override;
 private:
-    DenseConvInt8TiledExecutor(Backend* backend, const Convolution2D* common, bool dynamicQuantExe, const DenseConvInt8TiledExecutor& exe);
+    DenseConvInt8TiledExecutor(Backend* backend, const Op* op, const DenseConvInt8TiledExecutor& exe);
 
     decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel;
     std::function<void(const float*, int8_t*, size_t, const float*, ssize_t, ssize_t, ssize_t)> mQuantFunc;
@@ -69,7 +70,6 @@ class DenseConvInt8TiledExecutor : public ConvInt8TiledExecutor {
     std::shared_ptr<Tensor> mBatchQuantInfo;
     std::shared_ptr<Tensor> mInputDeqScales;
     std::shared_ptr<Tensor> mTempMaxMinValueBuffer;
-    std::shared_ptr<CPUConvolution::Resource> mResource;
     std::vector<uint8_t> mTempSrcSum;
     std::vector<int32_t> mDivides;
 
diff --git a/source/backend/cpu/compute/ConvInt8Winograd.cpp b/source/backend/cpu/compute/ConvInt8Winograd.cpp
index 2d0a4b5f2..433b88812 100644
--- a/source/backend/cpu/compute/ConvInt8Winograd.cpp
+++ b/source/backend/cpu/compute/ConvInt8Winograd.cpp
@@ -23,17 +23,20 @@ namespace MNN {
 
 std::shared_ptr<ConvInt8Winograd::WinoResource> ConvInt8Winograd::makeWinoResource(const int8_t* originWeight, std::shared_ptr<Tensor> scaleFloat, const int32_t* attr, Backend* backend, int oc, int ic, int kernelY, int kernelX) {
     auto core = static_cast<CPUBackend*>(backend)->int8Functions();
+    auto gcore = static_cast<CPUBackend*>(backend)->functions();
     int UNIT, SRC_UNIT, DST_XUNIT;
     core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
-    int oc4 = UP_DIV(oc, UNIT), ic4 = UP_DIV(ic, SRC_UNIT);
+    int pack = gcore->pack;
+    int ocDivUnit = UP_DIV(oc, UNIT), ic4 = UP_DIV(ic, SRC_UNIT);
+    int oc4 = UP_DIV(oc, pack);
     int kySize = attr[2], kxSize = attr[3], unitY = attr[4], unitX = attr[5]; attr += 6;
     int alphaY = kySize + unitY - 1, alphaX = kxSize + unitX - 1, alpha2 = alphaY * alphaX;
     
     std::shared_ptr<Tensor> weight, offsets, scales, inputScales;
-    weight.reset(Tensor::createDevice<int8_t>({alpha2, oc4, ic4, UNIT, SRC_UNIT}));
-    offsets.reset(Tensor::createDevice<float>({alpha2, oc4, UNIT}));
-    scales.reset(Tensor::createDevice<float>({alpha2, oc4 * UNIT}));
-    inputScales.reset(Tensor::createDevice<float>({alpha2, UNIT}));
+    weight.reset(Tensor::createDevice<int8_t>({alpha2, ocDivUnit, ic4, UNIT, SRC_UNIT}));
+    offsets.reset(Tensor::createDevice<float>({alpha2, oc4, pack}));
+    scales.reset(Tensor::createDevice<float>({alpha2, oc4 * pack}));
+    inputScales.reset(Tensor::createDevice<float>({alpha2, pack}));
     
     auto allocTensors = [=](std::vector<std::shared_ptr<Tensor>> tensors) -> bool {
         bool success = true;
@@ -54,8 +57,8 @@ std::shared_ptr<ConvInt8Winograd::WinoResource> ConvInt8Winograd::makeWinoResour
     auto weightScaleData = (const float*)attr; attr += alpha2 * oc;
     for (int i = 0; i < alpha2; ++i) {
         auto scale = 1.0f / inputScaleData[i];
-        for (int u = 0; u < UNIT; ++u) {
-            inputScales->host<float>()[i * UNIT + u] = scale;
+        for (int u = 0; u < pack; ++u) {
+            inputScales->host<float>()[i * pack + u] = scale;
         }
     }
     
@@ -86,7 +89,7 @@ std::shared_ptr<ConvInt8Winograd::WinoResource> ConvInt8Winograd::makeWinoResour
             float scale = weightScaleData[a * oc + oz];
             for (int sz = 0; sz < ic; ++sz) {
                 int sz4 = sz / SRC_UNIT, szRemain = sz % SRC_UNIT;
-                int index = (((a * oc4 + oz4) * ic4 + sz4) * UNIT + ozRemain) * SRC_UNIT + szRemain;
+                int index = (((a * ocDivUnit + oz4) * ic4 + sz4) * UNIT + ozRemain) * SRC_UNIT + szRemain;
                 float srcData = weightFloat->host<float>()[(a * oc + oz) * ic + sz];
                 // -ffast-math may cause inexact input then wrong rounded result, add eps to avoid this
                 float eps = ((srcData/scale) > 0 ? 1 : -1) * 1e-6;
@@ -97,8 +100,9 @@ std::shared_ptr<ConvInt8Winograd::WinoResource> ConvInt8Winograd::makeWinoResour
                 offset += quanData * (-128);
 #endif
             }
-            offsets->host<float>()[a * oc4 * UNIT + oz] = offset * scale * inputScaleData[a];
-            scales->host<float>()[a * oc4 * UNIT + oz] = scale * inputScaleData[a];
+            
+            offsets->host<float>()[a * oc4 * pack + oz] = offset * scale * inputScaleData[a];
+            scales->host<float>()[a * oc4 * pack + oz] = scale * inputScaleData[a];
         }
     }
     backend->onReleaseBuffer(originWeightFloat.get(), Backend::STATIC);
@@ -184,6 +188,7 @@ ErrorCode ConvInt8Winograd::onResize(const std::vector<Tensor *> &inputs, const
     int UNIT, SRC_UNIT, DST_XUNIT;
     core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
     UNIT = gcore->pack;
+    int pack = gcore->pack;
     
     auto input = mInputFloat.get(), output = outputs[0];
     int batch = input->batch(), ic = input->channel(), oc = output->channel();
@@ -197,7 +202,7 @@ ErrorCode ConvInt8Winograd::onResize(const std::vector<Tensor *> &inputs, const
     }
     for (auto& unit : mUnits) {
         int sy = ALIMAX(unit.kyStart - mPadY, 0), sx = ALIMAX(unit.kxStart - mPadX, 0);
-        auto srcChunk = TensorUtils::getDescribeOrigin(input)->mem->chunk() + (sy * iw + sx) * UNIT;
+        auto srcChunk = TensorUtils::getDescribeOrigin(input)->mem->chunk() + (sy * iw + sx) * pack;
         unit.input.reset(Tensor::createDevice<float>({batch, ic, ih - sy, iw - sx}, Tensor::CAFFE_C4));
         TensorUtils::getDescribeOrigin(unit.input.get())->mem = (new CPUMemObj(nullptr, srcChunk, 0));
         for (int i = 0; i < input->dimensions(); ++i) {
@@ -223,14 +228,14 @@ static void mergeAddBiasScaleQuantize(const std::vector<Tensor*>& inputs, Tensor
     auto coreInt8 = cpuBn->int8Functions();
     int UNIT, SRC_UNIT, DST_XUNIT;
     coreInt8->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
-    UNIT = core->pack;
+    int pack = core->pack;
     
-    int countC4 = UP_DIV(output->channel(), UNIT), plane = output->height() * output->width() * output->batch();
+    int countC4 = UP_DIV(output->channel(), pack), plane = output->height() * output->width() * output->batch();
     auto mergeFloat = inputs[0]->host<float>();
     for (int i = 1; i < inputs.size(); ++i) {
         core->MNNMatrixAdd(mergeFloat, mergeFloat, inputs[i]->host<float>(), plane * countC4, 0, 0, 0, 1);
     }
-    std::vector<float> fakeScale(countC4 * UNIT, 1);
+    std::vector<float> fakeScale(countC4 * pack, 1);
     core->MNNScaleAndAddBias(mergeFloat, mergeFloat, quanParam->biasFloat, fakeScale.data(), plane, countC4);
     coreInt8->MNNFloat2Int8(mergeFloat, output->host<int8_t>(), plane * countC4, quanParam->scale, quanParam->minValue, quanParam->maxValue, zeroPoint);
 }
@@ -282,7 +287,8 @@ ErrorCode ConvInt8Winograd::onExecute(const std::vector<Tensor *> &inputs, const
     auto gcore = bn->functions();
     int UNIT, SRC_UNIT, DST_XUNIT;
     core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
-    UNIT = gcore->pack;
+    // UNIT = gcore->pack;
+    int pack = gcore->pack;
     // scale, zero, min, max
     auto inputQuant = TensorUtils::getQuantInfo(inputs[0]);
     auto outputQuant = TensorUtils::getQuantInfo(outputs[0]);
@@ -299,9 +305,9 @@ ErrorCode ConvInt8Winograd::onExecute(const std::vector<Tensor *> &inputs, const
         };
     }
 
-    std::vector<float> scale(UNIT, inputQuant[0]);
+    std::vector<float> scale(pack, inputQuant[0]);
     int size = bn->getTensorSize(mInputFloat.get());
-    core->MNNInt8ScaleToFloat(mInputFloat->host<float>(), inputs[0]->host<int8_t>(), scale.data(), size / UNIT, inputQuant[1]);
+    core->MNNInt8ScaleToFloat(mInputFloat->host<float>(), inputs[0]->host<int8_t>(), scale.data(), size / pack, inputQuant[1]);
     std::vector<Tensor*> tmp_outputs;
     for (auto& unit : mUnits) {
         unit.input->buffer().host = TensorUtils::getDescribeOrigin(unit.input.get())->mem->chunk().ptr();
@@ -312,7 +318,7 @@ ErrorCode ConvInt8Winograd::onExecute(const std::vector<Tensor *> &inputs, const
         tmp_outputs.push_back(unit.output.get());
     }
     QuanPostTreatParameters quanParam;
-    scale.assign(UNIT, 1.0 / outputQuant[0]);
+    scale.assign(pack, 1.0 / outputQuant[0]);
     quanParam.scale = scale.data();
     // For winograd Int8, will not treat origin bias to int32, use float directly
     quanParam.biasFloat = mResource->mOriginBias->host<float>();
@@ -333,14 +339,14 @@ ConvInt8Winograd::WinoExecution::WinoExecution(std::shared_ptr<WinoResource> res
     
     int UNIT, SRC_UNIT, DST_XUNIT;
     core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
-    UNIT = gcore->pack;
+    int pack = gcore->pack;
 
     int threadNumber = ((CPUBackend *)backend())->threadNumber();
     int alphaY = mUnitY + mKernelY - 1, alphaX = mUnitX + mKernelX - 1, alpha2 = alphaY * alphaX;
-    int ic4 = UP_DIV(inputCount, SRC_UNIT), oc4 = UP_DIV(outputCount, UNIT);
+    int ic4 = UP_DIV(inputCount, SRC_UNIT), oc4 = UP_DIV(outputCount, pack);
     mTempInputBuffer.reset(Tensor::createDevice<int8_t>({threadNumber, alpha2, ic4, DST_XUNIT * SRC_UNIT}));
-    mTempOutputBuffer.reset(Tensor::createDevice<float>({threadNumber, alpha2, oc4, DST_XUNIT * UNIT}));
-    int midSize = alpha2 * DST_XUNIT * ALIMAX(ROUND_UP(inputCount, UNIT), oc4 * UNIT);
+    mTempOutputBuffer.reset(Tensor::createDevice<float>({threadNumber, alpha2, oc4, DST_XUNIT * pack}));
+    int midSize = alpha2 * DST_XUNIT * ALIMAX(ROUND_UP(inputCount, pack), oc4 * pack);
     mTransformMidBuffer.reset(Tensor::createDevice<float>({threadNumber, 3, midSize}));
 }
 ConvInt8Winograd::WinoExecution::WinoExecution(Backend* bn, const WinoExecution& exe)
@@ -374,6 +380,7 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector<Tensor *>
     int UNIT, SRC_UNIT, DST_XUNIT;
     coreInt8->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
     UNIT = core->pack;
+    int pack = core->pack;
     
     auto gemmFunc = coreInt8->Int8GemmKernel;
     CoreFunctions::WinoUnrollTransFunc srcTransXFunc = nullptr, srcTransYFunc = nullptr;
@@ -395,8 +402,8 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector<Tensor *>
     
     int ow = output->width(), oh = output->height();
     int iw = input->width(), ih = input->height();
-    int ic = input->channel(), ic_4 = UP_DIV(ic, UNIT);
-    int dc_4 = UP_DIV(output->channel(), UNIT);
+    int ic = input->channel(), ic_4 = UP_DIV(ic, pack);
+    int dc_4 = UP_DIV(output->channel(), pack);
 
     int padY = mPadY, padX = mPadX;
     auto wUnit = UP_DIV(ow, mUnitX), hUnit = UP_DIV(oh, mUnitY);
@@ -418,9 +425,9 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector<Tensor *>
         for (int hbIndex=oybBegin; hbIndex <= oybEnd; ++hbIndex) {
             auto hIndex = hbIndex % hUnit;
             auto bIndex = hbIndex / hUnit;
-            auto bOffset = iw * ih * UNIT * bIndex;
+            auto bOffset = iw * ih * pack * bIndex;
             auto srcBatch = srcOrigin + bOffset;
-            int dstZStep = DST_XUNIT * UNIT, unitStep = dstZStep * ic_4;
+            int dstZStep = DST_XUNIT * pack, unitStep = dstZStep * ic_4;
             int step = std::min(wUnit - oxBegin, remain);
             int srcY  = hIndex * mUnitY - padY;
             int ey    = ALIMIN(srcY + alphaY, ih) - srcY;
@@ -447,38 +454,38 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector<Tensor *>
                 int srcX  = wIndex * mUnitX - padX;
                 int sx    = ALIMAX(0, srcX) - srcX;
                 int ex    = ALIMIN(srcX + alphaX, iw) - srcX;
-                auto dst_x = dstOrigin + si * UNIT;
+                auto dst_x = dstOrigin + si * pack;
                 
-                int sourceZStep = iw * ih * UNIT * batch, sourceYStep = iw * UNIT;
-                auto srcStart = srcBatch + srcY * sourceYStep + srcX * UNIT;
+                int sourceZStep = iw * ih * pack * batch, sourceYStep = iw * pack;
+                auto srcStart = srcBatch + srcY * sourceYStep + srcX * pack;
                 // when input window exceed limit (so need pad value), copy from src to midbuffer0
                 if (ex - sx != alphaX || ey - sy != alphaY) {
-                    ::memset(midBuffer0, 0, alpha2 * ic_4 * UNIT * sizeof(float));
-                    int count = UNIT * (ex - sx);
+                    ::memset(midBuffer0, 0, alpha2 * ic_4 * pack * sizeof(float));
+                    int count = pack * (ex - sx);
                     for (int z = 0; count > 0 && z < ic_4; ++z) {
                         for (int yy = sy; yy < ey; ++yy) {
-                            auto dst_yy = midBuffer0 + ((z * alphaY + yy) * alphaX + sx) * UNIT;
-                            auto src_yy = srcStart + z * sourceZStep + yy * sourceYStep + sx * UNIT;
+                            auto dst_yy = midBuffer0 + ((z * alphaY + yy) * alphaX + sx) * pack;
+                            auto src_yy = srcStart + z * sourceZStep + yy * sourceYStep + sx * pack;
                             ::memcpy(dst_yy, src_yy, count * sizeof(float));
                         }
                     }
                     srcStart = midBuffer0;
-                    sourceZStep = alpha2 * UNIT;
-                    sourceYStep = alphaX * UNIT;
+                    sourceZStep = alpha2 * pack;
+                    sourceYStep = alphaX * pack;
                 }
                 for (int sz = 0; sz < ic_4; ++sz) {
                     for (int s = 0; s < sStep; ++s) {
-                        auto dst = dst_x + sz * dstZStep + s * UNIT;
-                        auto src = srcStart + sz * sourceZStep + s * mUnitX * UNIT;
-                        srcTransXFunc(src, midBuffer1, sourceYStep, alphaX * UNIT, UNIT, UNIT);
-                        srcTransYFunc(midBuffer1, dst, UNIT, unitStep, alphaX * UNIT, alphaX * unitStep);
+                        auto dst = dst_x + sz * dstZStep + s * pack;
+                        auto src = srcStart + sz * sourceZStep + s * mUnitX * pack;
+                        srcTransXFunc(src, midBuffer1, sourceYStep, alphaX * pack, pack, pack);
+                        srcTransYFunc(midBuffer1, dst, pack, unitStep, alphaX * pack, alphaX * unitStep);
                     }
                 }
                 si += sStep;
             }
             oxBegin = 0;
             remain -= step;
-            dstOrigin += UNIT * step;
+            dstOrigin += pack * step;
         }
         
     };
@@ -488,7 +495,7 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector<Tensor *>
 
         auto weight    = mWinoResource->weight->host<int8_t>();
         std::vector<float> xkernelSum(DST_XUNIT, 0);
-        std::vector<float> wKernelSum(dc_4 * UNIT, 0);
+        std::vector<float> wKernelSum(dc_4 * pack, 0);
         std::vector<float> reluThred = {-std::numeric_limits<float>().max(), std::numeric_limits<float>().max()};
         
         auto tFunction = [&](int tId) {
@@ -505,20 +512,20 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector<Tensor *>
     #ifndef MNN_WINO_TRANFORM_TEST_CLOSE
                 src_trans_func(buffer2, srcOrigin, buffer0, xIndex, xC);
     #endif
-                ::memset(buffer1, 0, dc_4 * UNIT * sizeof(float));
+                ::memset(buffer1, 0, dc_4 * pack * sizeof(float));
                 // Multi
                 for (int i = 0; i < alpha2; ++i) {
                     auto _srcInt8Ptr = _srcOrigin + i * mTempInputBuffer->stride(1);
                     
-                    auto scaleVec = mWinoResource->transInputScales->host<float>() + i * UNIT;
+                    auto scaleVec = mWinoResource->transInputScales->host<float>() + i * pack;
                     int zeroPoint = mWinoResource->transInputZeroPoints[i];
-                    coreInt8->MNNFloat2Int8(buffer2 + i * DST_XUNIT * ic_4 * UNIT, (UNIT == SRC_UNIT ? _srcInt8Ptr: (int8_t*)buffer0), ic_4 * DST_XUNIT, scaleVec, -127, 127, zeroPoint);
-                    if (UNIT != SRC_UNIT) {
+                    coreInt8->MNNFloat2Int8(buffer2 + i * DST_XUNIT * ic_4 * pack, (pack == SRC_UNIT ? _srcInt8Ptr: (int8_t*)buffer0), ic_4 * DST_XUNIT, scaleVec, -127, 127, zeroPoint);
+                    if (pack != SRC_UNIT) {
                         int areaOffset[] = {DST_XUNIT, DST_XUNIT}, byte = sizeof(float);
-                        _reorderCommon((float*)_srcInt8Ptr, buffer0, DST_XUNIT, UP_DIV(ic, byte), areaOffset, UNIT / byte, SRC_UNIT / byte);
+                        _reorderCommon((float*)_srcInt8Ptr, buffer0, DST_XUNIT, UP_DIV(ic, byte), areaOffset, pack / byte, SRC_UNIT / byte);
                     }
                     
-                    auto _dstFloatPtr = _dstOrigin + i * dc_4 * xC * UNIT;
+                    auto _dstFloatPtr = _dstOrigin + i * dc_4 * xC * pack;
                     auto _weightInt8Ptr = weight + i * mWinoResource->weight->stride(0);
                     QuanPostTreatParameters quanParam;
                     quanParam.biasFloat = (mWinoResource->offsets->host<float>() + i * mWinoResource->offsets->stride(0));
@@ -526,16 +533,16 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector<Tensor *>
                     quanParam.srcKernelSum = xkernelSum.data();
                     quanParam.weightQuanBias = wKernelSum.data();
                     quanParam.fp32minmax = reluThred.data();
-                    quanParam.scale = mWinoResource->scales->host<float>() + i * dc_4 * UNIT;
+                    quanParam.scale = mWinoResource->scales->host<float>() + i * dc_4 * pack;
                     quanParam.extraScale = nullptr;
-                    gemmFunc((int8_t*)_dstFloatPtr, _srcInt8Ptr, _weightInt8Ptr, mTempInputBuffer->length(2), xC * UNIT * sizeof(float), dc_4, &quanParam, xC);
+                    gemmFunc((int8_t*)_dstFloatPtr, _srcInt8Ptr, _weightInt8Ptr, mTempInputBuffer->length(2), xC * pack * sizeof(float), dc_4, &quanParam, xC);
                 }
     #ifndef MNN_WINO_TRANFORM_TEST_CLOSE
                 {
                     auto midBuffer0 = buffer0;
                     auto midBuffer1 = (float*)((int8_t*)midBuffer0 + mTransformMidBuffer->stride(1));
-                    int srcZStep = xC * UNIT;
-                    int unitStep = dc_4 * xC * UNIT;
+                    int srcZStep = xC * pack;
+                    int unitStep = dc_4 * xC * pack;
                     int oybBegin = xIndex / wUnit;
                     int oxBegin = xIndex % wUnit;
                     int oybEnd = (xIndex + xC-1) / wUnit;
@@ -565,32 +572,32 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector<Tensor *>
                         for (int si=0; si<step;) {
                             int sStep = (si == sBegin ? sEnd - sBegin : 1);
                             auto wIndex = si + oxBegin;
-                            auto srcXi = dstS + UNIT * si;
+                            auto srcXi = dstS + pack * si;
                             int dstX = wIndex * mUnitX;
-                            auto dstStart = dstOrigin + (dstX + dstY * ow + bIndex * ow * oh) * UNIT;
+                            auto dstStart = dstOrigin + (dstX + dstY * ow + bIndex * ow * oh) * pack;
                             int ex = ALIMIN(dstX + mUnitX, ow) - dstX;
-                            int count = ex * UNIT;
+                            int count = ex * pack;
                             
                             auto _dstStart = dstStart;
-                            int dstZStep = oh * ow * batch * UNIT, dstYStep = ow * UNIT;
+                            int dstZStep = oh * ow * batch * pack, dstYStep = ow * pack;
                             if (ex != mUnitX || (alphaX == 1 && ey != mUnitY)) {
-                                dstZStep = mUnitY * mUnitX * UNIT;
-                                dstYStep = mUnitX * UNIT;
+                                dstZStep = mUnitY * mUnitX * pack;
+                                dstYStep = mUnitX * pack;
                                 _dstStart = midBuffer1;
                             }
                             for (int z = 0; z < dc_4; ++z) {
                                 for (int x = 0; x < sStep; ++x) {
-                                    auto srcXiZ = srcXi + z * srcZStep + x * UNIT;
-                                    auto _dstStartZ = _dstStart + z * dstZStep + x * mUnitX * UNIT;
-                                    dstTransYFunc[alphaX](srcXiZ, midBuffer0, nullptr, nullptr, unitStep, UNIT, alphaX * unitStep, alphaX * UNIT);
-                                    dstTransXFunc[ey](midBuffer0, _dstStartZ, nullptr, nullptr, alphaX * UNIT, dstYStep, UNIT, UNIT);
+                                    auto srcXiZ = srcXi + z * srcZStep + x * pack;
+                                    auto _dstStartZ = _dstStart + z * dstZStep + x * mUnitX * pack;
+                                    dstTransYFunc[alphaX](srcXiZ, midBuffer0, nullptr, nullptr, unitStep, pack, alphaX * unitStep, alphaX * pack);
+                                    dstTransXFunc[ey](midBuffer0, _dstStartZ, nullptr, nullptr, alphaX * pack, dstYStep, pack, pack);
                                 }
                             }
                             if (ex != mUnitX || (alphaX == 1 && ey != mUnitY)) {
                                 for (int z = 0; z < dc_4; ++z) {
                                     for (int yy = 0; yy < ey; ++yy) {
-                                        auto srcYAddr = _dstStart + (z * mUnitY + yy) * mUnitX * UNIT;
-                                        auto dstYAddr = dstStart + z * ow * oh * batch * UNIT + yy * ow * UNIT;
+                                        auto srcYAddr = _dstStart + (z * mUnitY + yy) * mUnitX * pack;
+                                        auto dstYAddr = dstStart + z * ow * oh * batch * pack + yy * ow * pack;
                                         ::memcpy(dstYAddr, srcYAddr, count * sizeof(float));
                                     }
                                 }
@@ -599,7 +606,7 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector<Tensor *>
                         }
                         oxBegin = 0;
                         remain -= step;
-                        dstS += UNIT * step;
+                        dstS += pack * step;
                     }
                 }
 #endif
diff --git a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp
index 40a444696..738d85826 100644
--- a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp
+++ b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp
@@ -26,8 +26,9 @@
 namespace MNN {
 
 static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend* backend,
-                              const Convolution2D* conv2d, const float* originWeight, size_t originWeightSize, const float* bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common> weightQuantInfo, bool supportSparse, bool lowMemory) {
+                              const Op* op, const float* originWeight, size_t originWeightSize, const float* bias, size_t biasSize, std::shared_ptr<ConvolutionCommon::Int8Common> weightQuantInfo, bool supportSparse, bool lowMemory) {
     auto cpuBackend = (CPUBackend*)backend;
+    auto conv2d = op->main_as_Convolution2D();
     auto common = conv2d->common();
 #ifdef MNN_USE_ONEDNN
     return OneDNN::createConvolution(common, backend, originWeight, originWeightSize, bias, biasSize);
@@ -47,9 +48,10 @@ static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend
 
     if (lowMemory && nullptr != weightQuantInfo.get() && originWeightSize == 0) {
         if (cpuBackend->memoryMode() == BackendConfig::Memory_Low) {
-            auto core = static_cast<CPUBackend*>(backend)->functions();
-            auto resourceInt8 = CPUConvolution::makeResourceInt8(backend, conv2d, core->pack);
-            return new DenseConvInt8TiledExecutor(backend, conv2d, resourceInt8, true);
+            // auto core = static_cast<CPUBackend*>(backend)->functions();
+            // auto resourceInt8 = CPUConvolution::makeResourceInt8(backend, op, core->pack);
+            // return new DenseConvInt8TiledExecutor(backend, op, resourceInt8, true);
+            return new DenseConvInt8TiledExecutor(backend, op, weightQuantInfo);
         } else {
             return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo);
         }
@@ -107,7 +109,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
             // The weight is storage as float sparse, but the backend don't support sparse compute, expand it
             forceFloat = true;
         }
-        quanCommon = ConvolutionCommon::load(conv2d, backend, forceFloat, lowMemory);
+        quanCommon = ConvolutionCommon::load(op, backend, forceFloat, lowMemory);
         if (nullptr == quanCommon) {
             MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", op->name()->c_str());
             return nullptr;
@@ -143,7 +145,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
     }
     MNN_ASSERT(group > 0);
     if (1 == group) {
-        return _createUnit(inputs[0], outputs[0], backend, conv2d, originWeight, originWeightSize,
+        return _createUnit(inputs[0], outputs[0], backend, op, originWeight, originWeightSize,
                            originBias, originBiasSize, quanCommon, supportSparse, lowMemory);
     }
     // TODO: Use Geometry to split
@@ -157,7 +159,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector<Tensor*>& inputs, c
     emptyOutput->setLength(1, outputs[0]->channel() / group);
     for (int i = 0; i < group; ++i) {
         auto newConvolution =
-            _createUnit(emptyInput.get(), emptyOutput.get(), backend, conv2d, originWeight + groupWeightSize * i,
+            _createUnit(emptyInput.get(), emptyOutput.get(), backend, op, originWeight + groupWeightSize * i,
                         groupWeightSize, conv2d->bias()->data() + groupOutputCount * i, groupOutputCount, quanCommon, supportSparse, lowMemory);
         subConvolution.push_back(std::shared_ptr<Execution>(newConvolution));
     }
diff --git a/source/backend/cpu/compute/DeconvolutionWithStride.cpp b/source/backend/cpu/compute/DeconvolutionWithStride.cpp
index 74b78d28d..732b6540d 100644
--- a/source/backend/cpu/compute/DeconvolutionWithStride.cpp
+++ b/source/backend/cpu/compute/DeconvolutionWithStride.cpp
@@ -177,7 +177,7 @@ DeconvolutionWithStride::DeconvolutionWithStride(const Tensor* input, const Op*
     int tempWeightSize   = 0;
     int srcCount = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, b, conv2D, &tempWeight, &tempWeightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, b, convOp, &tempWeight, &tempWeightSize);
     srcCount = tempWeightSize / kx / ky / outputCount;
 
     int sy = common->strideY();
@@ -270,7 +270,7 @@ void DeconvolutionWithStride::_extract(const Op* convOp) {
     int tempWeightSize   = 0;
     int srcCount = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, backend(), conv2D, &tempWeight, &tempWeightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, backend(), convOp, &tempWeight, &tempWeightSize);
     srcCount = tempWeightSize / kx / ky / outputCount;
     
     std::shared_ptr<Tensor> weightWrap(
diff --git a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp
index 61dfb445a..918f47fa1 100644
--- a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp
+++ b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp
@@ -29,36 +29,12 @@ void DenseConvolutionTiledExecutor::initWeight(float *dest, const float *source,
 }
 bool DenseConvolutionTiledExecutor::initQuantizeResource(std::shared_ptr<ConvolutionCommon::Int8Common> int8Info, std::shared_ptr<CPUConvolution::Resource> resource, int hU, int hP, int lU, int lP, int outputCount, int srcChannel, int kernelSize, int bytes) {
     int weightLength = hU * lU * hP * lP;
-    resource->mWeight.reset(Tensor::createDevice<int8_t>(std::vector<int>{hU, lU * lP, hP}));
-//    resource->mWeight.reset(Tensor::createDevice<uint8_t>({weightLength}));
-    auto res = resource->backend->onAcquireBuffer(resource->mWeight.get(), Backend::STATIC);
-    if (!res) {
-        return false;
-    }
     resource->mDequantize.bits = 8;
     resource->lU = lU;
     resource->hU = hU;
     resource->lP = lP;
     resource->hP = hP;
     MNN_ASSERT(lP == 1);
-    // Reorder weight
-
-    auto dstWInt8 = resource->mWeight->host<int8_t>();
-    auto srcWInt8 = int8Info->weight.get();
-    ::memset(dstWInt8, 0, resource->mWeight->usize());
-    for (int y=0; y<outputCount; ++y) {
-        int yo = y / hP;
-        int yi = y % hP;
-        auto srcY = srcWInt8 + y * srcChannel * kernelSize;
-        auto dstY = dstWInt8 + yo * lP * hP * lU + yi;
-        for (int iz=0; iz<srcChannel; ++iz) {
-            for (int k=0; k<kernelSize; ++k) {
-                int sx = iz * kernelSize + k;
-                int dx = iz + k * srcChannel;
-                dstY[dx * hP] = srcY[sx];
-            }
-        }
-    }
     // Save scale bias
     int dequantCnt = int8Info->alpha.size();
     int scaleSize = dequantCnt; // real size
@@ -69,7 +45,7 @@ bool DenseConvolutionTiledExecutor::initQuantizeResource(std::shared_ptr<Convolu
     int blockNum = scaleSize / outputCount;
     scaleSize = blockNum * hU * hP; // pack size
     resource->mDequantize.mScaleBias.reset(MNN::Tensor::createDevice<uint8_t>({scaleSize * 2 * bytes}));
-    res = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC);
+    auto res = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC);
     if (!res) {
         return false;
     }
@@ -78,22 +54,68 @@ bool DenseConvolutionTiledExecutor::initQuantizeResource(std::shared_ptr<Convolu
         MNN_ASSERT(weightLength % 2 == 0);
         weightLength = UP_DIV(weightLength, 2);
         resource->mDequantize.bits = 4;
-        std::shared_ptr<MNN::Tensor> weightLow(Tensor::createDevice<int8_t>(
-            {weightLength}));
-        auto res = resource->backend->onAcquireBuffer(weightLow.get(), Backend::STATIC);
+        resource->mWeight.reset(Tensor::createDevice<int8_t>(std::vector<int>{weightLength}));
+        auto res = resource->backend->onAcquireBuffer(resource->mWeight.get(), Backend::STATIC);
         if (!res) {
             return false;
         }
-        auto srcPtr = resource->mWeight->host<int8_t>();
-        auto dstPtr = weightLow->host<unsigned char>();
-        for (int i=0; i<weightLength; ++i) {
-            int s0 = srcPtr[2 * i + 0] + 8;
-            int s1 = srcPtr[2 * i + 1] + 8;
-            int d = s0 * 16 + s1;
-            dstPtr[i] = d;
+        auto dstWInt4 = resource->mWeight->host<uint8_t>();
+        auto srcWInt4 = int8Info->weight.get();
+        if (kernelSize == 1 && srcChannel % 2 == 0 && hU * hP == outputCount) {
+            for (int i = 0; i < hU; i++) {
+                for (int j = 0; j < srcChannel/2; j++) {
+                    for (int k = 0; k < hP/2; k++) {
+                        uint8_t s0 = srcWInt4[((i * hP + (k * 2 + 0)) * srcChannel) / 2 + j];
+                        uint8_t s1 = srcWInt4[((i * hP + (k * 2 + 1)) * srcChannel) / 2 + j];
+                        uint8_t d0 = (s0 & 0xf0) | (s1 >> 4);
+                        uint8_t d1 = (s0 << 4) | (s1 & 0x0f);
+                        dstWInt4[(i * srcChannel + (j * 2 + 0)) * hP / 2 + k] = d0;
+                        dstWInt4[(i * srcChannel + (j * 2 + 1)) * hP / 2 + k] = d1;
+                    }
+                }
+            }
+        } else {
+            // [oc, ic, ks] -> [oc/hP, ks, ic, hP]
+            ::memset(dstWInt4, 0, resource->mWeight->usize());
+            for (int y = 0; y < outputCount; ++y) {
+                int yo = y / hP;
+                int yi = y % hP;
+                for (int iz = 0; iz < srcChannel; ++iz) {
+                    for (int k=0; k < kernelSize; ++k) {
+                        int sx = y * srcChannel * kernelSize + iz * kernelSize + k;
+                        int dx = yo * lP * hP * lU + (iz + k * srcChannel) * hP + yi;
+                        uint8_t s = srcWInt4[sx/2];
+                        s = (sx % 2) ? (s & 0xf) : (s >> 4);
+                        s = (dx % 2) ? s : (s << 4);
+                        dstWInt4[dx/2] |= s;
+                    }
+                }
+            }
         }
         originOffset = -8;
-        resource->mWeight = weightLow;
+    } else {
+        resource->mWeight.reset(Tensor::createDevice<int8_t>(std::vector<int>{hU, lU * lP, hP}));
+        auto res = resource->backend->onAcquireBuffer(resource->mWeight.get(), Backend::STATIC);
+        if (!res) {
+            return false;
+        }
+        // Reorder weight for int8
+        auto dstWInt8 = resource->mWeight->host<int8_t>();
+        auto srcWInt8 = int8Info->weight.get();
+        ::memset(dstWInt8, 0, resource->mWeight->usize());
+        for (int y=0; y<outputCount; ++y) {
+            int yo = y / hP;
+            int yi = y % hP;
+            auto srcY = srcWInt8 + y * srcChannel * kernelSize;
+            auto dstY = dstWInt8 + yo * lP * hP * lU + yi;
+            for (int iz=0; iz<srcChannel; ++iz) {
+                for (int k=0; k<kernelSize; ++k) {
+                    int sx = iz * kernelSize + k;
+                    int dx = iz + k * srcChannel;
+                    dstY[dx * hP] = srcY[sx];
+                }
+            }
+        }
     }
     auto alphaPtr = resource->mDequantize.mScaleBias->host<float>();
     auto biasPtr = reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(alphaPtr) + scaleSize * bytes);
@@ -180,6 +202,9 @@ DenseConvolutionTiledExecutor::DenseConvolutionTiledExecutor(const Convolution2D
         MNN_ASSERT(nullptr != int8Info.get());
         originWeightSize = int8Info->weight.size();
     }
+    if (int8Info && int8Info->canUseInt4) {
+        originWeightSize *= 2;
+    }
     // Don't use common->inputCount for old model common->inputCount is zero
     auto srcCount    = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY();
     auto lSize = srcCount * common->kernelX() * common->kernelY();
diff --git a/source/backend/cpu/compute/GemmInt8Executor.cpp b/source/backend/cpu/compute/GemmInt8Executor.cpp
index bc5abc93b..a73afdba8 100644
--- a/source/backend/cpu/compute/GemmInt8Executor.cpp
+++ b/source/backend/cpu/compute/GemmInt8Executor.cpp
@@ -14,10 +14,10 @@
 
 namespace MNN {
 
-GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr<ResourceInt8> resource, const Convolution2D *conv2D, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel, std::vector<int32_t> bias):
-    CPUConvolution(conv2D->common(), bn), mResourceInt8(resource), mMutableResource(resource, bn), mGemmKernel(gemmKernel), mQuantBias(bias){
+GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr<ResourceInt8> resource, const Op *op, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel, std::vector<int32_t> bias) :
+    CPUConvolution(op->main_as_Convolution2D()->common(), bn), mResourceInt8(resource), mMutableResource(resource, bn), mGemmKernel(gemmKernel), mQuantBias(bias){
         mResource.reset(new Resource);
-        CPUConvolution::makeResource(bn, mResource, conv2D, mResourceInt8);
+        CPUConvolution::makeResource(bn, mResource, op, mResourceInt8);
 }
 
 GemmInt8Executor::~GemmInt8Executor() {
@@ -39,8 +39,8 @@ ErrorCode GemmInt8Executor::onResize(const std::vector<Tensor *> &inputs, const
     auto output = outputs[0];
 
     auto core = static_cast<CPUBackend*>(backend())->int8Functions();
-    int UNIT___, SRC_UNIT, DST_XUNIT;
-    core->MNNGetGemmUnit(&UNIT___, &SRC_UNIT, &DST_XUNIT);
+    int UNIT__, SRC_UNIT, DST_XUNIT;
+    core->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT);
     auto gcore = static_cast<CPUBackend*>(backend())->functions();
     auto pack = gcore->pack;
 
@@ -81,19 +81,20 @@ ErrorCode GemmInt8Executor::onResize(const std::vector<Tensor *> &inputs, const
     mIm2ColParamter.kernelY         = 1;
     mIm2ColParamter.padX            = 0;
     mIm2ColParamter.padY            = 0;
-    mIm2ColParamter.kernelCountUnit = UP_DIV(input->channel(), SRC_UNIT);
-    if (SRC_UNIT > UNIT___ && UNIT___ == pack) {
+    if (SRC_UNIT > pack) {
         const auto srcCountUnit = UP_DIV(input->channel(), pack);
+        mIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit, SRC_UNIT / pack);
         mIm2ColParamter.ic = mIm2ColParamter.icDiv4 * pack;
     } else {
         const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT);
-        mIm2ColParamter.ic = mIm2ColParamter.icDiv4 * pack;
+        mIm2ColParamter.kernelCountUnit = srcCountUnit;
+        mIm2ColParamter.ic = srcCountUnit * SRC_UNIT;
     }
 
     mTileCnt = UP_DIV(input->height() * input->width() * input->batch(), DST_XUNIT);
     const int threads = std::max(static_cast<CPUBackend*>(backend())->threadNumber(), 1);
     mThreadNums       = std::min(threads, mTileCnt);
-    
+
     mInputCol.reset(Tensor::createDevice<int8_t>({mThreadNums, DST_XUNIT,  mIm2ColParamter.kernelCountUnit * SRC_UNIT}));
     bool success = backend()->onAcquireBuffer(mInputCol.get(), Backend::DYNAMIC);
     if (!success) {
@@ -137,7 +138,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
 
     auto im2colPtr           = mInputCol->host<int8_t>();
     auto outputDataPtr       = output->host<float>();
-    
+
     auto bias_elesize = ocDiv4 * PackUnit;
     QuanPostTreatParameters quanParam;
     quanParam.scale = mScaleData.data();
@@ -156,7 +157,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
     quanParam.weightQuanBias = mKernelSum.data();
     quanParam.extraScale = nullptr;
     float dequantScale = mMutableResource.mResource->mInputScale;
-    
+
     SumByAxisParams sumParams;
     sumParams.DST_XUNIT = DST_XUNIT;
     sumParams.SRC_UNIT = SRC_UNIT;
@@ -210,7 +211,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
         threadFunction((int)tId);
     }
     MNN_CONCURRENCY_END();
-    
+
     // MNN_PRINT("deconv int8 execute: cost time: %llu us\n", kernelTimer.durationInUs());
     return NO_ERROR;
 }
diff --git a/source/backend/cpu/compute/GemmInt8Executor.hpp b/source/backend/cpu/compute/GemmInt8Executor.hpp
index 0c1345f03..a93f90435 100644
--- a/source/backend/cpu/compute/GemmInt8Executor.hpp
+++ b/source/backend/cpu/compute/GemmInt8Executor.hpp
@@ -14,7 +14,7 @@
 namespace MNN {
 class GemmInt8Executor : public CPUConvolution {
 public:
-    GemmInt8Executor(Backend* bn, std::shared_ptr<ResourceInt8> resource, const Convolution2D *conv2D, decltype(CoreInt8Functions::Int8GemmKernel), 
+    GemmInt8Executor(Backend* bn, std::shared_ptr<ResourceInt8> resource, const Op *op, decltype(CoreInt8Functions::Int8GemmKernel),
                      std::vector<int32_t> bias);
     virtual ~GemmInt8Executor();
     virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
diff --git a/source/backend/cpu/compute/IdstConvolutionInt8.cpp b/source/backend/cpu/compute/IdstConvolutionInt8.cpp
index 025ed8763..05a9df338 100644
--- a/source/backend/cpu/compute/IdstConvolutionInt8.cpp
+++ b/source/backend/cpu/compute/IdstConvolutionInt8.cpp
@@ -65,9 +65,9 @@ IdstConvolutionInt8::IdstConvolutionInt8(const Convolution2DCommon* convOp, Back
     auto kernelCount        = kx * ky;
     auto srcCount           = mSrcCount;
     std::vector<int> shape;
-    if (SRC_UNIT > UNIT && UNIT == PackUnit) {
+    if (SRC_UNIT > PackUnit) {
         MNN_ASSERT(SRC_UNIT % UNIT == 0);
-        shape = {UP_DIV(outputCount, UNIT), UP_DIV(UP_DIV(srcCount, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT};
+        shape = {UP_DIV(outputCount, UNIT), UP_DIV(UP_DIV(srcCount, PackUnit) * kernelCount, SRC_UNIT / PackUnit), UNIT, SRC_UNIT};
     } else {
         shape = {UP_DIV(outputCount, UNIT), UP_DIV(srcCount, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT};
     }
diff --git a/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp b/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp
index 5c8fc0dca..ed5364c10 100644
--- a/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp
+++ b/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp
@@ -64,8 +64,9 @@ bool SparseConvInt8TiledExecutor::reorderWeight(Backend* b, const Convolution2DC
     return true;
 }
 
-SparseConvInt8TiledExecutor::SparseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr<ResourceInt8> res) : ConvInt8TiledExecutor(backend, convOp, res) {
+SparseConvInt8TiledExecutor::SparseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ResourceInt8> res) : ConvInt8TiledExecutor(backend, op, res) {
 
+    auto convOp = op->main_as_Convolution2D();
     std::shared_ptr<Tensor> weightOrigin;
     weightOrigin.swap(mResourceInt8->mWeightInt8);
     const SparseCommon* sparseCommon = convOp->sparseParameter();
@@ -81,9 +82,9 @@ SparseConvInt8TiledExecutor::SparseConvInt8TiledExecutor(Backend* backend, const
 
 }
 
-SparseConvInt8TiledExecutor::SparseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp,
+SparseConvInt8TiledExecutor::SparseConvInt8TiledExecutor(Backend* backend, const Op* op,
                                                          const SparseConvInt8TiledExecutor& exe)
-    : ConvInt8TiledExecutor(backend, convOp, exe.mResourceInt8),
+    : ConvInt8TiledExecutor(backend, op, exe.mResourceInt8),
       mNNZMap(exe.mNNZMap),
       mDataOffsetMap(exe.mDataOffsetMap),
       mSparseBlockOC(exe.mSparseBlockOC),
@@ -98,7 +99,7 @@ bool SparseConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution**
     if (nullptr == dst) {
         return true;
     }
-    auto exe = new SparseConvInt8TiledExecutor(bn, op->main_as_Convolution2D(), *this);
+    auto exe = new SparseConvInt8TiledExecutor(bn, op, *this);
     if (!exe->valid()) {
         return false;
     }
@@ -176,13 +177,13 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inp
     auto im2colPtr           = mTempIm2ColBuffer->host<int8_t>();
     auto outputDataPtr       = output->host<int8_t>();
     QuanPostTreatParameters quanParam;
-    quanParam.bias = mMutableResource.mBiasInt32->host<int32_t>();
-    quanParam.scale = mMutableResource.mScaleFloat->host<float>();
-    quanParam.maxValue = mMutableResource.mClampMax;
+    quanParam.bias = mMutableResource->mBiasInt32->host<int32_t>();
+    quanParam.scale = mMutableResource->mScaleFloat->host<float>();
+    quanParam.maxValue = mMutableResource->mClampMax;
     if (mResourceInt8->mRelu) {
-        quanParam.minValue = mMutableResource.mOutputZeroPoint;
+        quanParam.minValue = mMutableResource->mOutputZeroPoint;
     } else {
-        quanParam.minValue = mMutableResource.mClampMin;
+        quanParam.minValue = mMutableResource->mClampMin;
     }
     // MNN_PRINT("outputPlaneLen: %d, reduce l:%zu, minValue:%d, maxValue:%d, mTileCount:%d\n", outputPlaneLen, mSparseQuantParam.l, quanParam.minValue, quanParam.maxValue, mTileCount);
     const int col_buffer_size = mTempIm2ColBuffer->stride(0);
@@ -207,9 +208,9 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inp
             bool needZero = res.second;
             if (needZero) {
 #ifdef MNN_USE_SSE
-                ::memset(colAddr, mMutableResource.mInputZeroPoint + 128, col_buffer_size);
+                ::memset(colAddr, mMutableResource->mInputZeroPoint + 128, col_buffer_size);
 #else
-                ::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size);
+                ::memset(colAddr, mMutableResource->mInputZeroPoint, col_buffer_size);
 #endif
             }
             info[0] = number;
diff --git a/source/backend/cpu/compute/SparseConvInt8TiledExecutor.hpp b/source/backend/cpu/compute/SparseConvInt8TiledExecutor.hpp
index 9bcb7ee61..3f57e4454 100644
--- a/source/backend/cpu/compute/SparseConvInt8TiledExecutor.hpp
+++ b/source/backend/cpu/compute/SparseConvInt8TiledExecutor.hpp
@@ -31,7 +31,7 @@ struct SparseQuantMatMulParam {
 class SparseConvInt8TiledExecutor : public ConvInt8TiledExecutor {
 public:
     // given weight+bias+scale, do post process
-    SparseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr<ResourceInt8> res);
+    SparseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ResourceInt8> res);
     virtual ~SparseConvInt8TiledExecutor();
     virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
     virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
@@ -50,7 +50,7 @@ class SparseConvInt8TiledExecutor : public ConvInt8TiledExecutor {
     }
 
 private:
-    SparseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, const SparseConvInt8TiledExecutor& exe);
+    SparseConvInt8TiledExecutor(Backend* backend, const Op* op, const SparseConvInt8TiledExecutor& exe);
 
     SparseQuantMatMulParam mSparseQuantParam;
     decltype(CoreInt8Functions::MNNPackedSparseQuantMatMulEpx1) mSparseQuantMatMulKernel;
diff --git a/source/backend/cuda/execution/ConvCutlassExecution.cu b/source/backend/cuda/execution/ConvCutlassExecution.cu
index 28f455a55..b8a960314 100644
--- a/source/backend/cuda/execution/ConvCutlassExecution.cu
+++ b/source/backend/cuda/execution/ConvCutlassExecution.cu
@@ -26,7 +26,7 @@ ConvCutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
     const float* filterDataPtr = nullptr;
     int weightSize = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &weightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &filterDataPtr, &weightSize);
     auto oc = common->outputCount();
 
     int l = weightSize / oc;
@@ -195,7 +195,7 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
     // Call from different function
     if(mFp32Infer){
         return callCutlassGemmCudaCoreFloat32(inputs, outputs);
-    } 
+    }
 
     mGpuComputeCap = runtime->compute_capability();
     //MNN_PRINT("Gpu smArch is sm_%d\n", mGpuComputeCap);
@@ -211,10 +211,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
         // 0 -> Gemm, 1~N -> BatchGemm
         int32_t batchSize = 0;
         // [0]->A, [1]->B, [2]->bias, [3]->output
-        std::pair<void *, int32_t> ptrOffset[4]; 
+        std::pair<void *, int32_t> ptrOffset[4];
         int32_t batchOffset[4];
         // [0]->alpha, [1]->beta, [2]->splitK
-        int32_t coefs[3]; 
+        int32_t coefs[3];
         // 0 -> RowColumn, 1 -> RowRow
         int32_t layout;
         bool epilogueVectorize
@@ -246,7 +246,7 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
         return NO_ERROR;
     }
     #endif
-    
+
     return callCutlassGemmTensorCore(inputs, outputs);
 }
 
diff --git a/source/backend/cuda/execution/ConvDepthWiseExecution.cu b/source/backend/cuda/execution/ConvDepthWiseExecution.cu
index f4baac9d5..d53f67972 100755
--- a/source/backend/cuda/execution/ConvDepthWiseExecution.cu
+++ b/source/backend/cuda/execution/ConvDepthWiseExecution.cu
@@ -40,7 +40,7 @@ __global__ void CONV_DW(const T* input,
         d_oc.divmod(index, tmp1, oz_2);
         d_ow.divmod(tmp1, tmp2, ox);
         d_oh.divmod(tmp2, ob, oy);
-        
+
         int oz = oz_2 << 1;
         int ix = ox * sw - pw;
         int iy = oy * sh - ph;
@@ -80,10 +80,10 @@ __global__ void CONV_DW(const T* input,
     }
 }
 
-__global__ void CONV_DW_HALF2_OPT(const half2* input, 
-    const half2* kernel, 
-    const half2* bias, 
-    half2 *output, 
+__global__ void CONV_DW_HALF2_OPT(const half2* input,
+    const half2* kernel,
+    const half2* bias,
+    half2 *output,
     const float maxV,
     const float minV,
     const int iw,
@@ -111,7 +111,7 @@ __global__ void CONV_DW_HALF2_OPT(const half2* input,
         d_oc.divmod(index, tmp1, oz_2);
         d_ow.divmod(tmp1, tmp2, ox);
         d_oh.divmod(tmp2, ob, oy);
-        
+
         int oz = oz_2;
         int ix = ox * sw - pw;
         int iy = oy * sh - ph;
@@ -144,10 +144,10 @@ __global__ void CONV_DW_HALF2_OPT(const half2* input,
     }
 }
 
-__global__ void CONV_DW3x3_HALF2_OPT(const half2* input, 
-    const half2* kernel, 
-    const half2* bias, 
-    half2 *output, 
+__global__ void CONV_DW3x3_HALF2_OPT(const half2* input,
+    const half2* kernel,
+    const half2* bias,
+    half2 *output,
     const float maxV,
     const float minV,
     const int iw,
@@ -175,7 +175,7 @@ __global__ void CONV_DW3x3_HALF2_OPT(const half2* input,
         d_oc.divmod(index, tmp1, oz_2);
         d_ow.divmod(tmp1, tmp2, ox_2);
         d_oh.divmod(tmp2, ob, oy);
-        
+
         int oz = oz_2;
         int ox = ox_2 << 1;
         int ix = ox - 1;
@@ -348,7 +348,7 @@ __global__ void CONV_DW_MULTI_WIDTH4(const T* input, const half* kernel, const h
         float color3 = color0;
 
         // Parallel pipelining read and calculate
-        float src; 
+        float src;
         float filter0, filter1, filter2, filter3;
         int src_offset = ((ob * ih + oy) * iw + (ox_4 << 2)) * c_p + oz;
         int filter_offset = 0 * c_p + oz;
@@ -450,7 +450,7 @@ __global__ void CONV_DW_MULTI_WIDTH_CHANNEL(const float* input, const half* kern
 
         float2 src    = ((float2 *)(input + src_offset + 0 * c_p))[0];
         float2 filter = __half22float2(((half2 *)(kernel + filter_offset + 0 * c_p))[0]);
-        
+
         color0.x += (src.x * filter.x);
         color0.y += (src.y * filter.y);
 
@@ -589,9 +589,9 @@ ErrorCode ConvDepthWiseCompute(Backend* bn,
         return NO_ERROR;
     }
 
-    if(dw == 1 && dh == 1) { 
+    if(dw == 1 && dh == 1) {
         if(sw == 1 && sh == 1 && pw == 0 && ph == 0 && kw > 3 && kw < 12 && kh == 1 && pw == 0 && ph == 0) {
-            
+
             if(ow % 4 == 0) {
                 DivModFast d_oc(c * PACK_NUMBER);
                 DivModFast d_ow(ow/4);
@@ -655,7 +655,7 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
     const float* filterDataPtr = nullptr;
     int weightSize = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &weightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &filterDataPtr, &weightSize);
     auto tempWeightStorage = pool->alloc(depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float));
     auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second;
     cuda_check(cudaMemset(tempWeight, 0, depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float)));
@@ -666,7 +666,7 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
     auto regionStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion));
     auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(offset));
     auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;
-    
+
     #ifdef ENABLE_CUDA_BF16
     if(static_cast<CUDABackend*>(bn)->getPrecision() == 3) {
         // [Oc, Kh*Kw] -> [Kh*Kw, Oc(p)]
@@ -677,7 +677,7 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
         WeightTransToBf16<<<block_num, threads_num>>>((const float*)tempWeight, (__nv_bfloat16*)res->mFilter, count,\
             kernelY * kernelX, depth, d_ocp);
         checkKernelErrors;
-    } 
+    }
     else
     #endif
     {
@@ -717,15 +717,15 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
         cuda_check(cudaMemcpy(tempBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
 
         #ifdef ENABLE_CUDA_BF16
-        if(static_cast<CUDABackend*>(bn)->getPrecision() == 3) 
+        if(static_cast<CUDABackend*>(bn)->getPrecision() == 3)
         {
             auto countBias = depthC * PACK_NUMBER;
             int block_num = runtime->blocks_num(countBias);
             int threads_num = runtime->threads_num();
             BiasTransToBf16<<<block_num, threads_num>>>((const float*)tempBias, (__nv_bfloat16*)res->mBias, countBias, depth);
             checkKernelErrors;
-        } 
-        else 
+        }
+        else
         #endif
         {
             reg.size[0] = 1;
diff --git a/source/backend/cuda/execution/ConvImplicitExecution.cu b/source/backend/cuda/execution/ConvImplicitExecution.cu
index 58cda9dd6..d5353499e 100644
--- a/source/backend/cuda/execution/ConvImplicitExecution.cu
+++ b/source/backend/cuda/execution/ConvImplicitExecution.cu
@@ -82,7 +82,7 @@ ConvImplicitExecution::Resource::Resource(Backend* backend, const MNN::Op* op) {
     const float* filterDataPtr = nullptr;
     int weightSize = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, backend, conv, &filterDataPtr, &weightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &weightSize);
     mKernelInfo.kernelN = common->outputCount();
     mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY;
 
@@ -93,7 +93,7 @@ ConvImplicitExecution::Resource::Resource(Backend* backend, const MNN::Op* op) {
         int ci_pack = UP_DIV(mKernelInfo.kernelC, PACK_NUMBER) * PACK_NUMBER;
         int co_pack = UP_DIV(mKernelInfo.kernelN, PACK_NUMBER) * PACK_NUMBER;
         int khw = mKernelInfo.kernelX * mKernelInfo.kernelY;
-    
+
         auto tempCacheBuffer = static_cast<CUDABackend*>(backend)->getStaticBufferPool()->alloc(weightSize * sizeof(float));
         float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second);
         runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice);
@@ -108,16 +108,16 @@ ConvImplicitExecution::Resource::Resource(Backend* backend, const MNN::Op* op) {
 
         DivModFast cipD(ci_pack);
         DivModFast khwD(khw);
-    
+
         int block_num = runtime->blocks_num(ci_pack * co_pack * khw);
         int block_size = runtime->threads_num();
-    
+
         if(static_cast<CUDABackend*>(backend)->getPrecision() == 1) {
             WeightPackFill_Implicit<<<block_num, block_size>>>((const float*)cacheWeight, (float*)mFilter, khw, ci_pack * co_pack * khw, mKernelInfo.kernelC, mKernelInfo.kernelN, cipD, khwD);
             checkKernelErrors;
         } else {
             WeightPackFill_Implicit<<<block_num, block_size>>>((const float*)cacheWeight, (half*)mFilter, khw, ci_pack * co_pack * khw, mKernelInfo.kernelC, mKernelInfo.kernelN, cipD, khwD);
-            checkKernelErrors;            
+            checkKernelErrors;
         }
         static_cast<CUDABackend*>(backend)->getStaticBufferPool()->free(tempCacheBuffer);
     }
@@ -142,7 +142,7 @@ ConvImplicitExecution::Resource::Resource(Backend* backend, const MNN::Op* op) {
         int alignSize = UP_DIV(biasSize, PACK_NUMBER) * PACK_NUMBER;
         biasTensor.reset(Tensor::createDevice<uint32_t>({alignSize}));
         backend->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
-    
+
         mBias = (void *)biasTensor.get()->buffer().device;
         cuda_check(cudaMemset(mBias, 0, alignSize*sizeof(float)));
         cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
@@ -159,7 +159,7 @@ ConvImplicitExecution::ConvImplicitExecution(Backend* backend, const MNN::Op* op
     #else
     Execution(backend),
     #endif
-    mOp(op) 
+    mOp(op)
 {
     mResource = res;
     int precisonLevel = static_cast<CUDABackend*>(backend)->getPrecision();
@@ -206,7 +206,7 @@ ErrorCode ConvImplicitExecution::onResize(const std::vector<Tensor*>  &inputs, c
     // Split K dimension into 1 partitions
     int split_k_slices = 1;
     int ci_pack = UP_DIV(input->channel(), PACK_NUMBER) * PACK_NUMBER;
-    int co_pack = UP_DIV(output->channel(), PACK_NUMBER) * PACK_NUMBER;  
+    int co_pack = UP_DIV(output->channel(), PACK_NUMBER) * PACK_NUMBER;
     // Construct Conv2dProblemSize with user defined output size
     cutlass::conv::Conv2dProblemSize problem_size(
         input->batch(),//int N,
@@ -253,7 +253,7 @@ ErrorCode ConvImplicitExecution::onResize(const std::vector<Tensor*>  &inputs, c
         mWorkspace = (void *)workspaceTensor.get()->buffer().device;
     }
 
-    // Check the problem size is supported or not 
+    // Check the problem size is supported or not
     cutlass::Status status = mImplicitConvOp.can_implement(arguments);
     cutlass_check(status);
 
diff --git a/source/backend/cuda/execution/ConvWinogradExecution.cu b/source/backend/cuda/execution/ConvWinogradExecution.cu
index 249720d10..fab1670db 100644
--- a/source/backend/cuda/execution/ConvWinogradExecution.cu
+++ b/source/backend/cuda/execution/ConvWinogradExecution.cu
@@ -14,7 +14,7 @@ namespace CUDA {
 
 #define UNIT 2
 template<typename T>
-__global__ void WinoWeightReorder(const float* GgGt, 
+__global__ void WinoWeightReorder(const float* GgGt,
     T* GgGt_trans,
     const int block,
     const int co_pack,
@@ -67,7 +67,7 @@ ConvWinogradExecution::Resource::Resource(Backend* backend, const MNN::Op* op) {
     const float* filterDataPtr = nullptr;
     int weightSize = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, backend, conv, &filterDataPtr, &weightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &weightSize);
     mKernelInfo.kernelN = common->outputCount();
     mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY;
 
@@ -110,7 +110,7 @@ ConvWinogradExecution::Resource::Resource(Backend* backend, const MNN::Op* op) {
         }
         static_cast<CUDABackend*>(backend)->getStaticBufferPool()->free(tempCacheBuffer);
     }
-    
+
     // Copy Bias
     int biasSize = conv->bias()->size();
     int alignSize = UP_DIV(biasSize, PACK_NUMBER) * PACK_NUMBER;
@@ -133,7 +133,7 @@ ConvWinogradExecution::ConvWinogradExecution(Backend* backend, const MNN::Op* op
     #else
     Execution(backend),
     #endif
-    mOp(op) 
+    mOp(op)
 {
     mResource = res;
     int precisonLevel = static_cast<CUDABackend*>(backend)->getPrecision();
@@ -197,10 +197,10 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*>  &inputs, c
     }
     auto bufferData = pool->alloc(BtdB_bytes * mBlock2 * mGemmInfo.elhPad[0] * mGemmInfo.elhPad[1]);
     mBtdB_Buffer = (void*)((uint8_t*)bufferData.first + bufferData.second);
-    
+
     auto bufferMatmul = pool->alloc(bytes * mBlock2 * mGemmInfo.elh[0] * mGemmInfo.elhPad[2]);
     mMatmul_Buffer = (void*)((uint8_t*)bufferMatmul.first + bufferMatmul.second);
-    
+
     pool->free(bufferData);
     pool->free(bufferMatmul);
 
@@ -231,7 +231,7 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*>  &inputs, c
             mWorkspace = (void *)workspaceTensor.get()->buffer().device;
         }
 
-        // Check the problem size is supported or not 
+        // Check the problem size is supported or not
         cutlass::Status status = mGemmBatchedCudaF32F32Ln.can_implement(arguments);
         cutlass_check(status);
 
@@ -258,24 +258,24 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*>  &inputs, c
                                                 (int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[2]),  // batch_stride_C
                                                 {alpha, beta},          // <- tuple of alpha and beta
                                                 mBlock2};                // batch_count
-    
+
             size_t workspace_size = GemmBatchedCuda_F16_F16_Linear_AlignCuda_Row_Column::get_workspace_size(arguments);
-    
+
             if(workspace_size != 0) {
                 workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
                 mResource->mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
                 mWorkspace = (void *)workspaceTensor.get()->buffer().device;
             }
-    
-            // Check the problem size is supported or not 
+
+            // Check the problem size is supported or not
             cutlass::Status status = mGemmBatchedCudaF16F16Ln.can_implement(arguments);
             cutlass_check(status);
-    
+
             // Initialize CUTLASS kernel with arguments and workspace pointer
             status = mGemmBatchedCudaF16F16Ln.initialize(arguments, (uint8_t *)mWorkspace);
             cutlass_check(status);
         } else {
-    
+
             typename GemmBatchedCuda_F16_F32_Linear_AlignCuda_Row_Column::Arguments arguments{problem_size,  // <- problem size of matrix multiplication
                                                 {(ElementInput_F16 *)mBtdB_Buffer, mGemmInfo.elhPad[1]},  // Ptr + ldm
                                                 (int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]), // batch_stride_A
@@ -287,24 +287,24 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*>  &inputs, c
                                                 (int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[2]),  // batch_stride_C
                                                 {alpha, beta},          // <- tuple of alpha and beta
                                                 mBlock2};                // batch_count
-    
+
             size_t workspace_size = GemmBatchedCuda_F16_F32_Linear_AlignCuda_Row_Column::get_workspace_size(arguments);
-    
+
             if(workspace_size != 0) {
                 workspaceTensor.reset(Tensor::createDevice<int8_t>({(int)workspace_size}));
                 mResource->mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
                 mWorkspace = (void *)workspaceTensor.get()->buffer().device;
             }
-    
-            // Check the problem size is supported or not 
+
+            // Check the problem size is supported or not
             cutlass::Status status = mGemmBatchedCudaF16F32Ln.can_implement(arguments);
             cutlass_check(status);
-    
+
             // Initialize CUTLASS kernel with arguments and workspace pointer
             status = mGemmBatchedCudaF16F32Ln.initialize(arguments, (uint8_t *)mWorkspace);
             cutlass_check(status);
         }
-    
+
         return NO_ERROR;
     }
     //MNN_PRINT("Winograd BatchGemm batch:%d, MNK:%d-%d-%d\n", mBlock2, mGemmInfo.elh[0], mGemmInfo.elhPad[2], mGemmInfo.elhPad[1]);
@@ -316,10 +316,10 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*>  &inputs, c
             // 0 -> Gemm, 1~N -> BatchGemm
             int32_t batchSize = 0;
             // [0]->A, [1]->B, [2]->bias, [3]->output
-            std::pair<void *, int32_t> ptrOffset[4]; 
+            std::pair<void *, int32_t> ptrOffset[4];
             int32_t batchOffset[4];
             // [0]->alpha, [1]->beta, [2]->splitK
-            int32_t coefs[3]; 
+            int32_t coefs[3];
             // 0 -> RowColumn, 1 -> RowRow
             int32_t layout;
             bool epilogueVectorize
@@ -374,7 +374,7 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*>  &inputs, c
                 mWorkspace = (void *)workspaceTensor.get()->buffer().device;
             }
 
-            // Check the problem size is supported or not 
+            // Check the problem size is supported or not
             cutlass::Status status = mGemmBatchedF16F16LnSm75.can_implement(arguments);
             cutlass_check(status);
 
@@ -404,7 +404,7 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*>  &inputs, c
             mWorkspace = (void *)workspaceTensor.get()->buffer().device;
         }
 
-        // Check the problem size is supported or not 
+        // Check the problem size is supported or not
         cutlass::Status status = mGemmBatchedF16F32LnSm75.can_implement(arguments);
         cutlass_check(status);
 
@@ -446,19 +446,19 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector<Tensor*> &inputs, c
     int block_size = runtime->threads_num();
     if(mFp32Infer) {
         WinoInputTrans<<<block_num, block_size>>>((const float*)input_addr, (float*)mBtdB_Buffer, UNIT,
-                (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack, 
+                (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack,
                 total, lD, whD, wD,
                 mPadX, mPadY, input->width(), input->height());
         checkKernelErrors;
     } else if(mFp16Fp32MixInfer) {
         WinoInputTrans<<<block_num, block_size>>>((const float*)input_addr, (half*)mBtdB_Buffer, UNIT,
-                (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack, 
+                (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack,
                 total, lD, whD, wD,
                 mPadX, mPadY, input->width(), input->height());
         checkKernelErrors;
     } else {
         WinoInputTrans<<<block_num, block_size>>>((const half*)input_addr, (half*)mBtdB_Buffer, UNIT,
-                (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack, 
+                (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack,
                 total, lD, whD, wD,
                 mPadX, mPadY, input->width(), input->height());
         checkKernelErrors;
@@ -487,7 +487,7 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector<Tensor*> &inputs, c
                 if(mIsTuned) {
                     runGemmBatchedTensorCoreFloat16Infer(&mInfo);
                 }
-                #endif 
+                #endif
                 if(!mIsTuned) {
                     cutlass::Status status = mGemmBatchedF16F16LnSm75();
                     cutlass_check(status);
@@ -500,14 +500,14 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector<Tensor*> &inputs, c
     block_size = runtime->threads_num();
     if (mFp16Fp32MixInfer || mFp32Infer) {
         WinoTrans2Output<<<block_num, block_size>>>((const float*)mMatmul_Buffer, (const float*)bias_addr, (float*)output_addr,
-                UNIT, mBlock2, output->channel(), co_pack, 
+                UNIT, mBlock2, output->channel(), co_pack,
                 count, hD, whD, wD,
                 output->width(), output->height(),
                 mActivationType);
         checkKernelErrors;
     } else {
         WinoTrans2Output<<<block_num, block_size>>>((const half*)mMatmul_Buffer, (const float*)bias_addr, (half*)output_addr,
-                UNIT, mBlock2, output->channel(), co_pack, 
+                UNIT, mBlock2, output->channel(), co_pack,
                 count, hD, whD, wD,
                 output->width(), output->height(),
                 mActivationType);
diff --git a/source/backend/cuda/execution/DeconvSingleInputExecution.cu b/source/backend/cuda/execution/DeconvSingleInputExecution.cu
index 1c9968c49..5bd945e06 100644
--- a/source/backend/cuda/execution/DeconvSingleInputExecution.cu
+++ b/source/backend/cuda/execution/DeconvSingleInputExecution.cu
@@ -17,7 +17,7 @@ namespace CUDA {
 DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
     mBackend = bn;
     auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime();
-    
+
     auto conv       = op->main_as_Convolution2D();
     auto common     = conv->common();
     mKernelInfo.kernelX        = common->kernelX();
@@ -33,7 +33,7 @@ DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
     const float* filterDataPtr = nullptr;
     int weightSize = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &weightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &filterDataPtr, &weightSize);
     mKernelInfo.kernelN = common->outputCount();
     mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY;
 
@@ -49,7 +49,7 @@ DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
     auto tempCacheBuffer = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float));
     float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second);
     runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice);
-    
+
     // Reorder weight
     if(static_cast<CUDABackend*>(bn)->getPrecision() == 1) {
         weightTensor.reset(Tensor::createDevice<int32_t>({param.elhPad[1] * param.elh[2]}));
@@ -57,8 +57,8 @@ DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
         weightTensor.reset(Tensor::createDevice<int16_t>({param.elhPad[1] * param.elh[2]}));
     }
     bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC);
-    mFilter = (void *)weightTensor.get()->buffer().device;    
-    
+    mFilter = (void *)weightTensor.get()->buffer().device;
+
     callWeightReorder((const void *)cacheWeight, (void *)mFilter, mKernelInfo, param.elhPad[1], (int)(static_cast<CUDABackend*>(bn)->getPrecision() == 1), runtime);
 
     static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer);
@@ -184,12 +184,12 @@ ErrorCode DeconvSingleInputExecution::onResize(const std::vector<Tensor*> &input
     mFilterAddr = mResource->mFilter;
     mBiasAddr   = mResource->mBias;
     mBackendPtr = mResource->mBackend;
- 
+
     // Call from different function
     if(mFp32Infer){
         return callCutlassGemmCudaCoreFloat32(inputs, outputs);
-    } 
- 
+    }
+
     mGpuComputeCap = runtime->compute_capability();
     //MNN_PRINT("Gpu smArch is sm_%d\n", mGpuComputeCap);
     if(mGpuComputeCap < 75) {
@@ -214,7 +214,7 @@ ErrorCode DeconvSingleInputExecution::onExecute(const std::vector<Tensor*> &inpu
     if(mFp16Fp32MixInfer) {
         size_t maxCount = mGemmInfo.elh[0] * mGemmInfo.elhPad[1];
         callFloat2Half((const void*)input_addr, (void*)mInputBuffer, maxCount, runtime);
-    } 
+    }
 
     // Run cutlass gemm forward
     runCutlassGemmFunc();
@@ -231,7 +231,7 @@ ErrorCode DeconvSingleInputExecution::onExecute(const std::vector<Tensor*> &inpu
 
 class CUDADeconvolutionCreator : public CUDABackend::Creator {
 public:
-    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, 
+    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
             const MNN::Op* op, Backend* backend) const override {
         if (nullptr != op->main_as_Convolution2D()->quanParameter()) {
             auto quan = op->main_as_Convolution2D()->quanParameter();
diff --git a/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu b/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu
index d2d1adaf2..c72651e15 100644
--- a/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu
+++ b/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu
@@ -26,7 +26,7 @@ ConvCutlassBf16Execution::Resource::Resource(Backend* bn, const MNN::Op* op) {
     const float* filterDataPtr = nullptr;
     int weightSize = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &weightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &filterDataPtr, &weightSize);
     auto oc = common->outputCount();
 
     int l = weightSize / oc;
diff --git a/source/backend/cuda/execution/weight_only_quant/ConvFpAIntBExecution.cu b/source/backend/cuda/execution/weight_only_quant/ConvFpAIntBExecution.cu
index a9851bb53..406add181 100644
--- a/source/backend/cuda/execution/weight_only_quant/ConvFpAIntBExecution.cu
+++ b/source/backend/cuda/execution/weight_only_quant/ConvFpAIntBExecution.cu
@@ -52,7 +52,7 @@ __global__ void CONV_FpAInt8B(const T* input,
         d_oc.divmod(index, tmp1, oz_2);
         d_ow.divmod(tmp1, tmp2, ox);
         d_oh.divmod(tmp2, ob, oy);
-        
+
         int oz = oz_2;
         int ix = ox * sw - pw;
         int iy = oy * sh - ph;
@@ -124,7 +124,7 @@ __global__ void CONV_FpAInt4B(const T* input,
         d_oc.divmod(index, tmp1, oz_2);
         d_ow.divmod(tmp1, tmp2, ox);
         d_oh.divmod(tmp2, ob, oy);
-        
+
         int oz = oz_2;
         int ix = ox * sw - pw;
         int iy = oy * sh - ph;
@@ -215,7 +215,7 @@ bool ConvFpAIntBExecution::isValid(const Convolution2D* conv, Backend* backend)
     return true;
 }
 
-    
+
 ConvFpAIntBExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
     mBackend = bn;
     auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime();
@@ -224,7 +224,7 @@ ConvFpAIntBExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
     auto common     = conv->common();
 
     //weight host->device
-    std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon = ConvolutionCommon::load(conv, mBackend, false, true);
+    std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon = ConvolutionCommon::load(op, mBackend, false, true);
 
     auto oc = common->outputCount();
     auto weightSize = quanCommon->weight.size();
@@ -481,7 +481,7 @@ ErrorCode ConvFpAIntBExecution::onExecute(const std::vector<Tensor*> &inputs, co
         maxV = 6.0f;
     }
 
-    auto total = outputs[0]->batch() * oh * ow * ocp; 
+    auto total = outputs[0]->batch() * oh * ow * ocp;
     auto& prop = runtime->prop();
     int limitThreads = UP_DIV(total, prop.multiProcessorCount);
     int threadNum = ALIMIN(prop.maxThreadsPerBlock/2, limitThreads);
@@ -503,9 +503,9 @@ ErrorCode ConvFpAIntBExecution::onExecute(const std::vector<Tensor*> &inputs, co
                 (const float*)mResource->mScale,  (const float*)mResource->mOffset, (const float*)bias_addr, (float*)output_addr,
                 maxV, minV, ic, icp, iw, ih, oc, ocp, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total,
                 d_oc, d_ow, d_oh);
-            checkKernelErrors;  
+            checkKernelErrors;
         }
-        
+
         return NO_ERROR;
     }
 
@@ -520,9 +520,9 @@ ErrorCode ConvFpAIntBExecution::onExecute(const std::vector<Tensor*> &inputs, co
             (const float*)mResource->mScale,  (const float*)mResource->mOffset, (const float*)bias_addr, (float*)output_addr,
             maxV, minV, ic, icp, iw, ih, oc, ocp, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total,
             d_oc, d_ow, d_oh);
-        checkKernelErrors;  
+        checkKernelErrors;
     }
-    
+
     return NO_ERROR;
 }
 
diff --git a/source/backend/hiai/execution/NPUConvolution.cpp b/source/backend/hiai/execution/NPUConvolution.cpp
index 40832368f..bb939dd23 100644
--- a/source/backend/hiai/execution/NPUConvolution.cpp
+++ b/source/backend/hiai/execution/NPUConvolution.cpp
@@ -49,7 +49,7 @@ ErrorCode NPUConvolution::onResize(const std::vector<Tensor *> &inputs, const st
 
     std::shared_ptr<MNN::ConvolutionCommon::Int8Common> quanCommon;
     if (nullptr != conv2D->quanParameter()) {
-        quanCommon = ConvolutionCommon::load(conv2D, backend(), true);
+        quanCommon = ConvolutionCommon::load(mOp, backend(), true);
         if (nullptr == quanCommon) {
             MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", mOp->name()->c_str());
         }
diff --git a/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp b/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp
index 059165ef0..e856c1702 100644
--- a/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp
+++ b/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp
@@ -49,7 +49,7 @@ ErrorCode NPUConvolutionDepthwise::onResize(const std::vector<Tensor *> &inputs,
     }
     std::shared_ptr<MNN::ConvolutionCommon::Int8Common> quanCommon;
     if (nullptr != conv2D->quanParameter()) {
-        quanCommon = ConvolutionCommon::load(conv2D, backend(), true);
+        quanCommon = ConvolutionCommon::load(mOp, backend(), true);
         if (nullptr == quanCommon) {
             MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", mOp->name()->c_str());
         }
@@ -71,7 +71,7 @@ ErrorCode NPUConvolutionDepthwise::onResize(const std::vector<Tensor *> &inputs,
     shared_ptr<hiai::op::ConvolutionDepthwise> conv(new hiai::op::ConvolutionDepthwise(opName));
 
     auto xOp = mNpuBackend->getInputOps(mOp);
-    
+
     // om input weight const op
     mConst_w = hiai::op::Const(opName + "_w_const");
     {
diff --git a/source/backend/metal/MetalConvolution.mm b/source/backend/metal/MetalConvolution.mm
index c1b6ee37c..d9dabb4bf 100755
--- a/source/backend/metal/MetalConvolution.mm
+++ b/source/backend/metal/MetalConvolution.mm
@@ -17,7 +17,7 @@
 namespace MNN {
 
 MetalConvolution::MetalConvolution(Backend *backend, const MNN::Op *op) : MetalConvolutionCommon(backend, op, nullptr) {
-    loadWeight(op->main_as_Convolution2D());
+    loadWeight(op);
 }
 MetalConvolution::MetalConvolution(Backend *backend, const MNN::Op *op, std::shared_ptr<MNN::Tensor> weight, std::shared_ptr<MNN::Tensor> bias) : MetalConvolutionCommon(backend, op, bias) {
     mWeight = weight;
@@ -47,7 +47,7 @@
     auto oh   = output->height();
     auto oc_4 = UP_DIV(output->channel(), 4);
     auto ob   = output->batch();
-    
+
     auto pads = ConvolutionCommon::convolutionPad(input, output, mOp->main_as_Convolution2D()->common());
     auto padX = pads.first;
     auto padY = pads.second;
@@ -77,7 +77,7 @@
                        mActivationType};
     mConstBuffer = backend->getConstBuffer(sizeof(constants));
     ::memcpy(mConstBuffer.contents, constants, sizeof(constants));
-    
+
     mParam = "_ic" + std::to_string(ic_4) + "oc" + std::to_string(oc_4) +
              "k" + std::to_string(mKernelX) + "x" + std::to_string(mKernelY) +
              "s" + std::to_string(mStrideX) + "x" + std::to_string(mStrideY) +
@@ -119,7 +119,7 @@
         int itemW[total_kernel] = {1, 1, 1, 2, 4};
         int itemH[total_kernel] = {1, 1, 1, 1, 1};
         int itemC[total_kernel] = {1, 4, 2, 1, 1};
-        
+
         int actual_kernel = 3;
         if(isS1D1P0) {
             actual_kernel = 4;
@@ -137,7 +137,7 @@
         }
 
         std::pair<NSUInteger, int> min_cost(INT_MAX, 0);//(min_time, min_index)
-        
+
         NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
                         (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
                         mConstBuffer, (((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil];
@@ -159,7 +159,7 @@
 
             std::string name = [shaderName[knl_idx] UTF8String] + mParam;
             auto ret = [context getGridAndThreadgroup:pipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name offsets: buffer_offset queue:backend->queue()];
-            
+
             if(min_cost.first > std::get<2>(ret)) {
                 min_cost.first = std::get<2>(ret);
                 min_cost.second = knl_idx;
@@ -178,7 +178,7 @@
 void MetalConvolution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
     auto input = inputs[0];
     auto output = outputs[0];
-    
+
     [encoder setComputePipelineState:mPipeline];
     MetalBackend::setTensor(input, encoder, 0);
     MetalBackend::setTensor(output, encoder, 1);
diff --git a/source/backend/metal/MetalConvolution1x1.mm b/source/backend/metal/MetalConvolution1x1.mm
index cfd9f47b6..33a3eb19d 100644
--- a/source/backend/metal/MetalConvolution1x1.mm
+++ b/source/backend/metal/MetalConvolution1x1.mm
@@ -25,10 +25,10 @@
 MetalConvolution1x1::MetalConvolution1x1(Backend *backend, const MNN::Op *op) : MetalConvolutionCommon(backend, op, nullptr) {
     auto conv2D = op->main_as_Convolution2D();
     bool ldInt8Weight = false;
-    if (conv2D->quanParameter() && conv2D->quanParameter()->buffer()) {
+    if (conv2D->quanParameter() && (conv2D->external() || conv2D->quanParameter()->buffer())) {
         ldInt8Weight = true;
     }
-    loadWeight(op->main_as_Convolution2D(), ldInt8Weight);
+    loadWeight(op, ldInt8Weight);
 }
 
 MetalConvolution1x1::MetalConvolution1x1(Backend *backend, const MNN::Op *op, std::shared_ptr<MNN::Tensor> weight, std::shared_ptr<MNN::Tensor> bias, std::shared_ptr<MNN::Tensor> dequantScale, int dequantBits) : MetalConvolutionCommon(backend, op, bias) {
@@ -78,7 +78,7 @@
     int constants[] = {is, ic_4, ow, oh, os, oc_4, oc, ob, blockSize, mActivationType};
     mConstBuffer = backend->getConstBuffer(sizeof(constants));
     ::memcpy(mConstBuffer.contents, constants, sizeof(constants));
-    
+
     MetalRuntime* rt = (MetalRuntime *)backend->runtime();
     if (mDequantScaleBias.get()) {
         NSUInteger gid_x = UP_DIV(ow * oh, 4);
@@ -106,7 +106,7 @@
             TensorUtils::getDescribe(bias)->extra.offset,
             TensorUtils::getDescribe(mDequantScaleBias.get())->extra.offset,
             0};
-        
+
         MetalRuntime *rt = (MetalRuntime *)backend->runtime();
         auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name offsets:buffer_offset  queue:backend->queue()];
         mThreads = std::make_pair(std::get<0>(ret), std::get<1>(ret));
@@ -123,7 +123,7 @@
             NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
                             (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
                             mConstBuffer, (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil];
-            
+
             const Tensor* weight = mWeight.get();
             const Tensor* bias = mBias.get();
             int buffer_offset[] = {TensorUtils::getDescribe(input)->extra.offset, TensorUtils::getDescribe(output)->extra.offset, 0, TensorUtils::getDescribe(weight)->extra.offset, TensorUtils::getDescribe(bias)->extra.offset, 0};
@@ -135,9 +135,9 @@
             NSUInteger gid_x = UP_DIV(ow * oh, 4);
             NSUInteger gid_y = oc_4;
             NSUInteger gid_z = ob;
-            
+
             mPipeline = [context pipelineWithName:@"conv1x1_g1z4" fp16:backend->useFp16InsteadFp32()];
-            
+
             NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
                             (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
                             mConstBuffer, (((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil];
@@ -160,23 +160,23 @@
             actual_kernel = 3;
         }
         std::pair<NSUInteger, int> min_cost(INT_MAX, 0);//(min_time, min_index)
-        
+
         NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
                         (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
                         mConstBuffer, (((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil];
         const Tensor* weight = mWeight.get();
         const Tensor* bias = mBias.get();
         int buffer_offset[] = {TensorUtils::getDescribe(input)->extra.offset, TensorUtils::getDescribe(output)->extra.offset, 0, TensorUtils::getDescribe(weight)->extra.offset, TensorUtils::getDescribe(bias)->extra.offset, 0};
-        
+
         for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) {
             id<MTLComputePipelineState> pipeline = [context pipelineWithName:shaderName[knl_idx] fp16:backend->useFp16InsteadFp32()];
             NSUInteger gid_x = UP_DIV(ow, itemW[knl_idx]);
             NSUInteger gid_y = UP_DIV(oc, itemC[knl_idx]);
             NSUInteger gid_z = 1;
-            
+
             std::string name = [shaderName[knl_idx] UTF8String];
             auto ret = [context getGridAndThreadgroup:pipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name offsets:buffer_offset queue:backend->queue()];
-            
+
             if(min_cost.first > std::get<2>(ret)) {
                 min_cost.first = std::get<2>(ret);
                 min_cost.second = knl_idx;
diff --git a/source/backend/metal/MetalConvolutionCommon.hpp b/source/backend/metal/MetalConvolutionCommon.hpp
index 299551922..a391d65e2 100644
--- a/source/backend/metal/MetalConvolutionCommon.hpp
+++ b/source/backend/metal/MetalConvolutionCommon.hpp
@@ -22,7 +22,7 @@ class MetalConvolutionCommon : public MetalExecution {
     virtual ~MetalConvolutionCommon() = default;
 
 protected:
-    void loadWeight(const MNN::Convolution2D *conv, bool loadWeightInt8 = false);
+    void loadWeight(const MNN::Op *op, bool loadWeightInt8 = false);
 
     virtual std::shared_ptr<MNN::Tensor> weightTransform(int group, int oc, int ic, int kh, int kw, const float *src, bool int8Weight = false, bool int4Weight = false);
 
diff --git a/source/backend/metal/MetalConvolutionCommon.mm b/source/backend/metal/MetalConvolutionCommon.mm
index 548aae2ef..318c138eb 100644
--- a/source/backend/metal/MetalConvolutionCommon.mm
+++ b/source/backend/metal/MetalConvolutionCommon.mm
@@ -146,15 +146,19 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src,
     }
     return dequantScale;
 }
-void MetalConvolutionCommon::loadWeight(const MNN::Convolution2D *conv, bool loadWeightInt8) {
+void MetalConvolutionCommon::loadWeight(const MNN::Op *op, bool loadWeightInt8) {
+    auto conv = op->main_as_Convolution2D();
     std::shared_ptr<ConvolutionCommon::Int8Common> qnt = NULL;
     if (loadWeightInt8) {
-        qnt          = ConvolutionCommon::load(conv, backend(), false, true);
+        qnt = ConvolutionCommon::load(op, backend(), false, true);
     } else if (conv->quanParameter()) {
-        qnt = ConvolutionCommon::load(conv, backend(), true);
+        qnt = ConvolutionCommon::load(op, backend(), true);
     }
     // param
     auto size   = qnt ? MAX(qnt->weight.size(), qnt->weightFloat.size()) : conv->weight()->size();
+    if (loadWeightInt8 && qnt->canUseInt4) {
+        size *= 2;
+    }
     auto common = conv->common();
     auto kw     = common->kernelX();
     auto kh     = common->kernelY();
@@ -185,6 +189,59 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src,
     auto goc_4  = UP_DIV(goc, 4);
     auto gic_4  = UP_DIV(gic, 4);
     auto weight_len = group * ROUND_UP(goc_4, 4) * gic_4 * kw * kh * 16;
+
+    if (int4Weight) {
+        weight_len = UP_DIV(weight_len, 2);
+        std::shared_ptr<MNN::Tensor> weightLow(MNN::Tensor::createDevice<int8_t>({weight_len}));
+        auto res = backend->onAcquireBuffer(weightLow.get(), Backend::STATIC);
+        if (!res) {
+            MNN_ERROR("Memory alloc error!\n");
+            return nullptr;
+        }
+        auto srcPtr = (int8_t*)src;
+        auto buf = MetalBackend::getBuffer(weightLow.get());
+        auto dstPtr = (uint8_t*)[buf.first contents] + buf.second;
+        auto oc_4  = UP_DIV(oc, 4);
+        auto ic_4  = UP_DIV(ic, 4);
+        if (group == 1 && kh == 1 && kw == 1) {
+            // fast int4 reorder
+            for (int i = 0; i < oc; i++) {
+                auto zo = i / 4, ro = i % 4;
+                for (int j = 0; j < ic; j++) {
+                    auto zi = j / 4, ri = j % 4;
+                    dstPtr[((zo * ic_4 + zi) * 16 + ro * 4 + ri) / 2] = srcPtr[(i * ic + j) / 2];
+                }
+            }
+        } else {
+            // slow int4 reorder
+            int sx = 0;
+            auto goc_4  = UP_DIV(goc, 4);
+            auto gic_4  = UP_DIV(gic, 4);
+            ::memset(dstPtr, 0, weight_len);
+            for (int g = 0; g < group; g++) {
+                for (int o = 0; o < goc; o++) {
+                    auto zo = o / 4, ro = o % 4;
+                    for (int i = 0; i < gic; i++) {
+                        auto zi = i / 4, ri = i % 4;
+                        for (int h = 0; h < kh; h++) {
+                            for (int w = 0; w < kw; w++) {
+                                // to   [g][o/4][i/4][h][w][16]
+                                // from [g][o][i][h][w]
+                                int dx = g * goc_4 * gic_4 * kh * kw * 16 + zo * gic_4 * kh * kw * 16 + ro * 4 + zi * kh * kw * 16 + ri + (h * kw + w) * 16;
+                                uint8_t s = srcPtr[sx/2];
+                                s = (sx % 2) ? (s & 0xf) : (s >> 4);
+                                s = (dx % 2) ? s : (s << 4);
+                                dstPtr[dx/2] |= s;
+                                sx++;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        return weightLow;
+    }
+
     std::shared_ptr<MNN::Tensor> t(MNN::Tensor::createDevice<float>({weight_len}));
     if (int8Weight || int4Weight) {
         t.reset(MNN::Tensor::createDevice<int8_t>({weight_len}));
@@ -195,33 +252,14 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src,
     }
     auto buffer = MetalBackend::getBuffer(t.get());
     auto dst = (uint8_t*)[buffer.first contents] + buffer.second;
-
-    if (int8Weight || int4Weight) {
+    if (int8Weight) {
         weightInBlock<int8_t, int8_t>(group, oc, ic, kh, kw, (int8_t*)src, dst);
     } else if (backend->useFp16InsteadFp32()) {
         weightInBlock<float, __fp16>(group, oc, ic, kh, kw, src, dst);
     } else {
         weightInBlock<float, float>(group, oc, ic, kh, kw, src, dst);
     }
-    if (int4Weight) {
-        weight_len = UP_DIV(weight_len, 2);
-        std::shared_ptr<MNN::Tensor> weightLow(MNN::Tensor::createDevice<int8_t>({weight_len}));
-        auto res = backend->onAcquireBuffer(weightLow.get(), Backend::STATIC);
-        if (!res) {
-            MNN_ERROR("Memory alloc error!\n");
-            return nullptr;
-        }
-        auto srcPtr = (int8_t*)dst;
-        auto buf = MetalBackend::getBuffer(weightLow.get());
-        auto dstPtr = (uint8_t*)[buf.first contents] + buf.second;
-        for (int i=0; i < weight_len; ++i) {
-            int s0 = srcPtr[2 * i + 0];
-            int s1 = srcPtr[2 * i + 1];
-            int d = (s0 + 8) * 16 + (s1 + 8);
-            dstPtr[i] = d;
-        }
-        return weightLow;
-    }
+
     return t;
 }
 
diff --git a/source/backend/metal/MetalConvolutionDepthwise.mm b/source/backend/metal/MetalConvolutionDepthwise.mm
index 546896bb9..85b17c88f 100755
--- a/source/backend/metal/MetalConvolutionDepthwise.mm
+++ b/source/backend/metal/MetalConvolutionDepthwise.mm
@@ -15,7 +15,7 @@
 namespace MNN {
 MetalConvolutionDepthwise::MetalConvolutionDepthwise(Backend *backend, const MNN::Op *op)
     : MetalConvolutionCommon(backend, op, nullptr) {
-    loadWeight(op->main_as_Convolution2D());
+    loadWeight(op);
 }
 
 ErrorCode MetalConvolutionDepthwise::onResize(const std::vector<Tensor *> &inputs,
@@ -60,14 +60,14 @@
     mConstBuffer = backend->getConstBuffer(sizeof(constants));
 
     ::memcpy(mConstBuffer.contents, constants, sizeof(constants));
-    
+
     auto context = (__bridge MNNMetalContext *)backend->context();
     mPipeline = [context pipelineWithName:@"conv_depthwise" fp16:backend->useFp16InsteadFp32()];
-            
+
     NSUInteger gid_x = ow;
     NSUInteger gid_y = oh;
     NSUInteger gid_z = oc_4*ob;
-            
+
     NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
                     (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
                     mConstBuffer, (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil];
diff --git a/source/backend/metal/MetalConvolutionWinograd.mm b/source/backend/metal/MetalConvolutionWinograd.mm
index dcc321782..a7db2c4cd 100644
--- a/source/backend/metal/MetalConvolutionWinograd.mm
+++ b/source/backend/metal/MetalConvolutionWinograd.mm
@@ -44,7 +44,7 @@
     auto conv = op->main_as_Convolution2D();
     mSrcUnit  = UNIT + conv->common()->kernelY() - 1;
     mDstUnit  = UNIT;
-    loadWeight(conv);
+    loadWeight(op);
 }
 MetalConvolutionWinograd::MetalConvolutionWinograd(Backend *backend, const MNN::Op *op, std::shared_ptr<Tensor> weight, std::shared_ptr<Tensor> bias) : MetalConvolutionCommon(backend, op, bias) {
     auto conv = op->main_as_Convolution2D();
@@ -81,7 +81,7 @@
     auto pads = ConvolutionCommon::convolutionPad(input, output, mOp->main_as_Convolution2D()->common());
     auto padX = pads.first;
     auto padY = pads.second;
-    
+
     // create const buffer
     struct TransformBuffer {
         int inputSize[4];
diff --git a/source/backend/metal/MetalDeconvolution.mm b/source/backend/metal/MetalDeconvolution.mm
index 38921d690..4338d9e30 100755
--- a/source/backend/metal/MetalDeconvolution.mm
+++ b/source/backend/metal/MetalDeconvolution.mm
@@ -145,7 +145,7 @@ void weightForDeconv(std::shared_ptr<MNN::Tensor> t, bool depthwise, const Convo
     // forcy downgrade to float like what CPU does
     std::shared_ptr<ConvolutionCommon::Int8Common> qnt = NULL;
     if (deconv->quanParameter()) {
-        qnt = ConvolutionCommon::load(deconv, backend, true);
+        qnt = ConvolutionCommon::load(op, backend, true);
     }
     auto kw     = common->kernelX();
     auto kh     = common->kernelY();
@@ -195,7 +195,7 @@ void weightForDeconv(std::shared_ptr<MNN::Tensor> t, bool depthwise, const Convo
     auto pad = ConvolutionCommon::convolutionTransposePad(input, output, mOp->main_as_Convolution2D()->common());
     const int padX  = pad.first;
     const int padY = pad.second;
-    
+
     // const buffer
     auto deltaKy = leastCommonMultiple(mDilateY, mStrideY) / mDilateY;
     auto deltaKx = leastCommonMultiple(mDilateX, mStrideX) / mDilateX;
@@ -227,7 +227,7 @@ void weightForDeconv(std::shared_ptr<MNN::Tensor> t, bool depthwise, const Convo
         mActivationType
     };
     mConstBuffer = [context newDeviceBuffer:sizeof(consts) bytes:consts access:CPUWriteOnly];
-    
+
     mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz * ob)];
     return NO_ERROR;
 }
diff --git a/source/backend/nnapi/execution/NNAPIConvolution.cpp b/source/backend/nnapi/execution/NNAPIConvolution.cpp
index 3c064b141..4aecae0d2 100644
--- a/source/backend/nnapi/execution/NNAPIConvolution.cpp
+++ b/source/backend/nnapi/execution/NNAPIConvolution.cpp
@@ -95,7 +95,7 @@ ErrorCode NNAPIConvolution::onResize(const std::vector<Tensor *> &inputs, const
             weightPtr = conv2D->quanParameter()->buffer()->data();
             weightSize = conv2D->quanParameter()->buffer()->size();
         } else if (nullptr != conv2D->quanParameter()) {
-            quanCommon = ConvolutionCommon::load(conv2D, backend(), true);
+            quanCommon = ConvolutionCommon::load(mOp, backend(), true);
             if (nullptr == quanCommon) {
                 MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", mOp->name()->c_str());
             }
diff --git a/source/backend/opencl/CMakeLists.txt b/source/backend/opencl/CMakeLists.txt
index ace6942cf..fc2fbdf1e 100644
--- a/source/backend/opencl/CMakeLists.txt
+++ b/source/backend/opencl/CMakeLists.txt
@@ -26,7 +26,7 @@ ENDIF()
 if (${CMAKE_SYSTEM_NAME} MATCHES "Android")
     add_definitions(-DMNN_USE_LIB_WRAPPER)
     add_definitions(-DMNN_OPENCL_SVM_ENABLE)
-    add_definitions(-DCL_HPP_TARGET_OPENCL_VERSION=120)
+    add_definitions(-DCL_HPP_TARGET_OPENCL_VERSION=110)
 else()
     if(${CMAKE_SYSTEM_NAME} MATCHES "Windows" OR ${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR MNN_SUPPORT_INTEL_SUBGROUP)
         add_definitions(-DMNN_SUPPORT_INTEL_SUBGROUP)
@@ -36,7 +36,7 @@ else()
     else()
         add_definitions(-DMNN_USE_LIB_WRAPPER)
         add_definitions(-DMNN_OPENCL_SVM_ENABLE)
-        add_definitions(-DCL_HPP_TARGET_OPENCL_VERSION=120)
+        add_definitions(-DCL_HPP_TARGET_OPENCL_VERSION=110)
     endif()
 endif()
 IF(MNN_SEP_BUILD)
diff --git a/source/backend/opencl/core/OpenCLBackend.cpp b/source/backend/opencl/core/OpenCLBackend.cpp
index d9bc65fab..b08f6a5cf 100644
--- a/source/backend/opencl/core/OpenCLBackend.cpp
+++ b/source/backend/opencl/core/OpenCLBackend.cpp
@@ -596,7 +596,7 @@ void OpenCLBackend::_allocHostBuffer(int length, const Tensor* srcTensor) const
         mDeviceBuffer = (cl::Buffer*)srcTensor->buffer().device;
     }
 #ifdef  __ANDROID__
-    else if(memType == MNN_FORWARD_OPENGL){
+    else if(memType == MNN_FORWARD_OPENGL && mOpenCLRuntime->isSupportGL()){
         cl_int error;
         mDeviceTexture.reset(new cl::ImageGL(mOpenCLRuntime->context(), CL_MEM_READ_WRITE, GL_TEXTURE_2D, 0, (cl_GLuint)srcTensor->buffer().device, &error));
         std::vector<cl::Memory> map = {*mDeviceTexture.get()};
@@ -671,7 +671,7 @@ void CLRuntime::convertFromDevice(const Tensor* srcTensor, const Tensor* dstTens
 #ifndef MNN_OPENCL_BUFFER_CLOSED
     if(mOpenCLRuntime->getGpuMemType() == BUFFER)
     {
-        if(MNN_FORWARD_OPENGL == memtype){
+        if(MNN_FORWARD_OPENGL == memtype && mOpenCLRuntime->isSupportGL()){
             OpenCL::convertNC4HW4BufferToImage(srcTensor, const_cast<Tensor*>(dstTensor), mOpenCLRuntime.get(), false, svmFlag);
             std::vector<cl::Memory> map = {openCLImage(dstTensor)};
             mOpenCLRuntime->commandQueue().enqueueReleaseGLObjects(&map, NULL);
@@ -722,7 +722,7 @@ void CLRuntime::convertFromDevice(const Tensor* srcTensor, const Tensor* dstTens
     else
 #endif /* MNN_OPENCL_BUFFER_CLOSED */
     {
-        if(MNN_FORWARD_OPENGL == memtype){
+        if(MNN_FORWARD_OPENGL == memtype && mOpenCLRuntime->isSupportGL()){
             std::vector<int> bufferShape = MNN::OpenCL::tensorShapeFormat(srcTensor);
 
             mOpenCLRuntime.get()->commandQueue().enqueueCopyImage(
@@ -784,7 +784,7 @@ void CLRuntime::convertToDevice(const Tensor* srcTensor, const Tensor* dstTensor
     #ifndef MNN_OPENCL_BUFFER_CLOSED
     if(mOpenCLRuntime->getGpuMemType() == BUFFER)
     {
-        if(MNN_FORWARD_OPENGL == memtype){
+        if(MNN_FORWARD_OPENGL == memtype && mOpenCLRuntime->isSupportGL()){
             OpenCL::convertImageToNC4HW4Buffer(srcTensor, const_cast<Tensor*>(dstTensor),mOpenCLRuntime.get(), false, svmFlag);
             std::vector<cl::Memory> map = {openCLImage(srcTensor)};
             mOpenCLRuntime->commandQueue().enqueueReleaseGLObjects(&map, NULL);
@@ -821,7 +821,7 @@ void CLRuntime::convertToDevice(const Tensor* srcTensor, const Tensor* dstTensor
     else
     #endif /* MNN_OPENCL_BUFFER_CLOSED */
     {
-        if(MNN_FORWARD_OPENGL == memtype){
+        if(MNN_FORWARD_OPENGL == memtype && mOpenCLRuntime->isSupportGL()){
             std::vector<int> bufferShape = MNN::OpenCL::tensorShapeFormat(dstTensor);
 
             mOpenCLRuntime.get()->commandQueue().enqueueCopyImage(
@@ -880,7 +880,11 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso
         mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr);
     }
     #else
-    mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr);
+    auto res = mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr);
+    if(res != CL_SUCCESS) {
+	MNN_ERROR("OpenCL enqueue write error:%d\n", res);
+	return;
+    }
     #endif
 
     //Covert format
@@ -902,6 +906,10 @@ void OpenCLBackend::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dst
             MNN_PRINT("Unsupport ForwardType %d for OpenCL backend!\n", memType);
             return;
         }
+        if(mOpenCLRuntime->isSupportGL() && MNN_FORWARD_OPENGL == memType){
+            MNN_PRINT("This Device can not find OpenCL GL_EXTENTION function!\n");
+            return;
+        }
         _allocHostBuffer(0, copyTensor);
 
         MNN::Tensor interTensor(copyTensor, copyTensor->getDimensionType(), false);
@@ -912,10 +920,6 @@ void OpenCLBackend::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dst
         }else{
             interTensor.buffer().device = (uint64_t)mHostBuffer.second.get();
         }
-        if(OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isGlError() && MNN_FORWARD_OPENGL == memType){
-            MNN_PRINT("This Device can not find OpenCL GL_EXTENTION function!\n");
-            return;
-        }
         //Covert format
         MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(copyTensor)->dimensionFormat;
         if(MNN_FORWARD_CPU != srcMemtype){
diff --git a/source/backend/opencl/core/OpenCLGemmTune.cpp b/source/backend/opencl/core/OpenCLGemmTune.cpp
index 00cd3ed98..388fba6f0 100644
--- a/source/backend/opencl/core/OpenCLGemmTune.cpp
+++ b/source/backend/opencl/core/OpenCLGemmTune.cpp
@@ -135,7 +135,7 @@ std::vector<uint32_t> getGemmParams(const std::vector<uint32_t> &gemmSize, const
     MNN_ASSERT(gemmSize[1] % 16 == 0);
     MNN_ASSERT(gemmSize[2] % 4 == 0);
 
-    MNN_ASSERT((gemmSize[5] == 0 && tensorMemory.size() == 3) || (gemmSize[5] == 1 && tensorMemory.size() == 4));
+    MNN_ASSERT((gemmSize[5] == 0 && tensorMemory.size() == 3) || (gemmSize[5] >= 1 && tensorMemory.size() == 4));
     auto& tunedGemmParams = runtime->tunedGemmParamsMap();
     
     std::vector<uint32_t> info(gemmSize);
@@ -292,8 +292,8 @@ std::vector<uint32_t> getGemmParams(const std::vector<uint32_t> &gemmSize, const
                 buildOptions.emplace(" -DRELAX_WORKGROUP_SIZE=1");
             }
             
-            if(gemmSize[5] == 1) {
-                buildOptions.emplace(" -DBIAS");
+            if(gemmSize[5] >= 1) {
+                buildOptions.emplace(" -DBIAS_TYPE=" + std::to_string((int)gemmSize[5]));
             }
 
             int localM = mdimc;
@@ -346,6 +346,8 @@ std::vector<uint32_t> getGemmParams(const std::vector<uint32_t> &gemmSize, const
                 if(gemmSize[5] == 1) {
                     ret |= kernel->get().setArg(idx++, tensorMemory[3]);
                     ret |= kernel->get().setArg(idx++, gemmSize[1]);
+                } else if(gemmSize[5] > 1) {
+                    MNN_ERROR("BatchGemm with bias type > 1 (elementwise) not supported! please check\n");
                 }
                 ret |= kernel->get().setArg(idx++, tensorMemory[2]);
                 ret |= kernel->get().setArg(idx++, batch_offset_c);
@@ -362,16 +364,19 @@ std::vector<uint32_t> getGemmParams(const std::vector<uint32_t> &gemmSize, const
                 int offset_a = 0;
                 int offset_b = 0;
                 int offset_c = 0;
-                
+                int offset[4] = {0, 0, 0, 0};
+                int stride[4] = {(int)gemmSize[0], (int)gemmSize[1], (int)gemmSize[1], (int)gemmSize[1]};
+                if(gemmSize[3] < 4) {
+                    stride[2] = gemmSize[0]; // output: [N, M]
+                }
                 ret |= kernel->get().setArg(idx++, tensorMemory[0]);
                 ret |= kernel->get().setArg(idx++, tensorMemory[1]);
-                if(gemmSize[5] == 1) {
+                if(gemmSize[5] >= 1) {
                     ret |= kernel->get().setArg(idx++, tensorMemory[3]);
                 }
                 ret |= kernel->get().setArg(idx++, tensorMemory[2]);
-                ret |= kernel->get().setArg(idx++, offset_a);
-                ret |= kernel->get().setArg(idx++, offset_b);
-                ret |= kernel->get().setArg(idx++, offset_c);
+                ret |= kernel->get().setArg(idx++, offset);
+                ret |= kernel->get().setArg(idx++, stride);
                 
                 MNN_CHECK_CL_SUCCESS(ret, "setArg getGemmParams Xgemm Kernel");
                 
diff --git a/source/backend/opencl/core/OpenCLRunningUtils.cpp b/source/backend/opencl/core/OpenCLRunningUtils.cpp
index 3898224d3..549d2be10 100644
--- a/source/backend/opencl/core/OpenCLRunningUtils.cpp
+++ b/source/backend/opencl/core/OpenCLRunningUtils.cpp
@@ -589,5 +589,21 @@ bool localWSTune(const std::map<std::string, std::vector<std::pair<std::vector<u
     return true;
 }
 
+bool getPreParamInfo(const std::string preParamName, uint32_t *preParamData,  OpenCLRuntime *runtime){
+    auto& preParamInfo = runtime->preParamsMap();
+    if (preParamInfo.find(preParamName) != preParamInfo.end()) {
+        *preParamData = preParamInfo[preParamName];
+        return true;
+    }
+    return false;
+}
+
+void setPreParamInfo(const std::string preParamName, uint32_t preParamData,  OpenCLRuntime *runtime){
+    auto& preParamInfo = runtime->preParamsMap();
+    if (preParamInfo.find(preParamName) == preParamInfo.end()) {
+        preParamInfo.insert(std::make_pair(preParamName, preParamData));
+    }
+}
+
 } // namespace OpenCL
 } // namespace MNN
diff --git a/source/backend/opencl/core/OpenCLRunningUtils.hpp b/source/backend/opencl/core/OpenCLRunningUtils.hpp
index 63c3fd7df..f9a911beb 100644
--- a/source/backend/opencl/core/OpenCLRunningUtils.hpp
+++ b/source/backend/opencl/core/OpenCLRunningUtils.hpp
@@ -126,6 +126,10 @@ bool localWSTune(const std::map<std::string, std::vector<std::pair<std::vector<u
 std::pair<std::vector<uint32_t>, uint32_t> localWS2DDefault(const std::vector<uint32_t> &gws, const uint32_t maxWorkGroupSize,
                                        OpenCLRuntime *runtime, const std::string &kernelName, const std::shared_ptr<KernelWrap> &mKernel);
 
+bool getPreParamInfo(const std::string preParamName, uint32_t *preParamData,  OpenCLRuntime *runtime);
+
+void setPreParamInfo(const std::string preParamName, uint32_t preParamData,  OpenCLRuntime *runtime);
+
 void copyBufferToImage(OpenCLRuntime *runtime, const cl::Buffer &buffer, const cl::Image &image, int w, int h);
 
 } // namespace OpenCL
diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp
index 7d6c4e5de..51ba62619 100644
--- a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp
+++ b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp
@@ -195,6 +195,7 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const
             mIsDeviceSupportedLowPower = (mIsDeviceSupportedLowPower || isPriorityHint);
             
             #ifdef MNN_USE_LIB_WRAPPER
+            mIsSupportGL = !OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isGlError();
             if(isPriorityHint)
             {
                 if(true == OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isPropError())
@@ -413,6 +414,10 @@ unsigned int OpenCLRuntime::getQueueNum() {
     return mQueueCount;
 }
 
+std::map<std::string, uint32_t>& OpenCLRuntime::preParamsMap(){
+    return mPreParams;
+}
+
 std::map<std::vector<uint32_t>, std::vector<uint32_t>>& OpenCLRuntime::tunedGemmParamsMap() {
     return mTunedGemmParams;
 }
@@ -863,6 +868,14 @@ std::pair<const void*, size_t> OpenCLRuntime::makeCache(void* tuneInfo) {
         cache->gemm.emplace_back(std::move(tuning));
     }
     
+    // Get All PreParam cache
+    for(auto& iter : mPreParams){
+        std::unique_ptr<PreParamInfoT> info(new PreParamInfoT);
+        info->preParamName = iter.first;
+        info->preParamData = iter.second;
+        cache->preParam.emplace_back(std::move(info));
+    }
+    
     flatbuffers::FlatBufferBuilder builder;
     auto lastOffset = Cache::Pack(builder, cache.get());
     builder.Finish(lastOffset);
@@ -964,6 +977,19 @@ bool OpenCLRuntime::setCache(std::pair<const void*, size_t> cache) {
             mTunedGemmParams.insert(std::make_pair(info, params));
         }
     }
+    
+    //Load PreParam Info
+    if(nullptr != cacheBuffer->preParam()){
+        auto preParamInfo = cacheBuffer->preParam();
+        for(int i = 0; i < preParamInfo->size(); ++i){
+            auto info = preParamInfo->GetAs<PreParamInfo>(i);
+            if (nullptr == info->preParamName()) {
+                MNN_ERROR("Error preParam info\n");
+                return false;
+            }
+            mPreParams.insert(std::make_pair(info->preParamName()->str(), info->preParamData()));
+        }
+    }
     return true;
 }
 
diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp
index 13f40a6c6..4a85de8bd 100644
--- a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp
+++ b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp
@@ -109,6 +109,9 @@ class OpenCLRuntime {
     float getCLVersion() {
         return mCLVersion;
     }
+    bool isSupportGL(){
+    	return mIsSupportGL;
+	}
 #ifdef MNN_OPENCL_SVM_ENABLE
     cl_device_svm_capabilities getSvmCapabilities() {
         return mSvmCapabilities;
@@ -141,6 +144,8 @@ class OpenCLRuntime {
     unsigned int mKernelTime = 0;
     
     
+    std::map<std::string, uint32_t>& preParamsMap();
+    
     std::map<std::vector<uint32_t>, std::vector<uint32_t>>& tunedGemmParamsMap();
 
     std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>, uint32_t>>& tunedLwsMap();
@@ -209,6 +214,7 @@ class OpenCLRuntime {
     bool mSupportDotInt8 = false;
     bool mSupportDotAccInt8 = false;
     bool mSupportedIntelSubgroup = false;
+    bool mIsSupportGL = true;
     GpuType mGpuType;
     MaliAr mMaliAr;
     float mCLVersion = 1.0f;
@@ -228,6 +234,7 @@ class OpenCLRuntime {
     double mStartNanos;
     double mStopNanos;
 
+    std::map<std::string, uint32_t> mPreParams;
     std::map<std::vector<uint32_t>, std::vector<uint32_t>> mTunedGemmParams;
     std::map<std::pair<std::string, std::vector<uint32_t>>, std::pair<std::vector<uint32_t>,  uint32_t>> mTunedLws;
     std::map<std::string, std::vector<std::pair<std::vector<uint32_t>, std::pair<std::vector<uint32_t>,  uint32_t>>>> mTuneLws;
diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp
index deaf9b627..8dc9957cf 100644
--- a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp
+++ b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp
@@ -127,9 +127,6 @@ bool OpenCLSymbols::isGlError() {
     return mGlError;
 }
 
-bool OpenCLSymbols::isCL1_2Error() {
-    return mCL_12Error;
-}
 
 bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
 #if defined(WIN32)
@@ -157,11 +154,6 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
         mQcomError = true; \
     }
 
-#define MNN_LOAD_CL_12_PTR(func_name) func_name = reinterpret_cast<func_name##Func>(GetProcAddress(handle_, #func_name)); \
-    if(func_name == nullptr){ \
-        mCL_12Error = true; \
-    }
-
 #define MNN_LOAD_GL_PTR(func_name) func_name = reinterpret_cast<func_name##Func>(GetProcAddress(handle_, #func_name)); \
     if(func_name == nullptr){ \
         mGlError = true; \
@@ -213,14 +205,6 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
         mQcomError = true; \
     }
 
-#define MNN_LOAD_CL_12_PTR(func_name) func_name = reinterpret_cast<func_name##Func>(dlsym(handle_, #func_name)); \
-    if(func_name == nullptr && loadOpenCLPointer != nullptr){ \
-        func_name = reinterpret_cast<func_name##Func>(loadOpenCLPointer(#func_name)); \
-    } \
-    if(func_name == nullptr){ \
-        mCL_12Error = true; \
-    }
-
 #define MNN_LOAD_GL_PTR(func_name) func_name = reinterpret_cast<func_name##Func>(dlsym(handle_, #func_name)); \
     if(func_name == nullptr && loadOpenCLPointer != nullptr){ \
         func_name = reinterpret_cast<func_name##Func>(loadOpenCLPointer(#func_name)); \
@@ -282,9 +266,6 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
     MNN_LOAD_GL_PTR(clCreateFromGLTexture);
     MNN_LOAD_GL_PTR(clEnqueueAcquireGLObjects);
     MNN_LOAD_GL_PTR(clEnqueueReleaseGLObjects);
-    MNN_LOAD_CL_12_PTR(clCreateImage);
-    MNN_LOAD_CL_12_PTR(clRetainDevice);
-    MNN_LOAD_CL_12_PTR(clReleaseDevice);
 
     MNN_LOAD_PROP_PTR(clCreateCommandQueueWithProperties);
     MNN_LOAD_SVM_PTR(clSVMAlloc);
@@ -664,12 +645,6 @@ cl_int CL_API_CALL clFinish(cl_command_queue command_queue) {
     return func(command_queue);
 }
 
-cl_mem CL_API_CALL clCreateImage(cl_context context, cl_mem_flags flags, const cl_image_format *image_format, const cl_image_desc *image_desc, void *host_ptr, cl_int *errcode_ret) {
-    auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clCreateImage;
-    MNN_CHECK_NOTNULL(func);
-    return func(context, flags, image_format, image_desc, host_ptr, errcode_ret);
-}
-
 cl_mem CL_API_CALL clCreateImage2D(cl_context context, cl_mem_flags flags, const cl_image_format *image_format, size_t imageWidth,
                        size_t imageHeight, size_t image_row_pitch, void *host_ptr, cl_int *errcode_ret) {
     auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clCreateImage2D;
@@ -740,18 +715,6 @@ cl_int CL_API_CALL clEnqueueReleaseGLObjects(cl_command_queue command_queue,
     return func(command_queue, num_objects, mem_objects, num_events_in_wait_list, event_wait_list, event);
 }
 
-cl_int CL_API_CALL clRetainDevice(cl_device_id device){
-    auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clRetainDevice;
-    MNN_CHECK_NOTNULL(func);
-    return func(device);
-}
-
-cl_int CL_API_CALL clReleaseDevice(cl_device_id device){
-    auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clReleaseDevice;
-    MNN_CHECK_NOTNULL(func);
-    return func(device);
-}
-
 // clCreateCommandQueueWithProperties wrapper
 cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties(cl_context context, cl_device_id device, const cl_queue_properties *properties, cl_int *errcode_ret) {
     auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clCreateCommandQueueWithProperties;
diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp
index e3617d92d..561ccde8c 100644
--- a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp
+++ b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp
@@ -17,9 +17,8 @@
 #endif
 #include <memory>
 #include "core/Macro.h"
-#define CL_TARGET_OPENCL_VERSION 200
-#define CL_HPP_TARGET_OPENCL_VERSION 120
-#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_TARGET_OPENCL_VERSION 110
+#define CL_HPP_MINIMUM_OPENCL_VERSION 110
 
 #if !defined(_MSC_VER)
 #pragma GCC diagnostic push
@@ -54,7 +53,6 @@ class OpenCLSymbols {
     bool isSvmError();
     bool isPropError();
     bool isQcomError();
-    bool isCL1_2Error();
     bool isGlError();
     
     using clGetPlatformIDsFunc        = cl_int (CL_API_CALL *)(cl_uint, cl_platform_id *, cl_uint *);
@@ -188,7 +186,6 @@ class OpenCLSymbols {
     MNN_CL_DEFINE_FUNC_PTR(clReleaseKernel);
     MNN_CL_DEFINE_FUNC_PTR(clCreateProgramWithSource);
     MNN_CL_DEFINE_FUNC_PTR(clCreateBuffer);
-    MNN_CL_DEFINE_FUNC_PTR(clCreateImage);
     MNN_CL_DEFINE_FUNC_PTR(clCreateImage2D);
     MNN_CL_DEFINE_FUNC_PTR(clRetainKernel);
     MNN_CL_DEFINE_FUNC_PTR(clCreateKernel);
@@ -232,8 +229,6 @@ class OpenCLSymbols {
     MNN_CL_DEFINE_FUNC_PTR(clCreateFromGLTexture);
     MNN_CL_DEFINE_FUNC_PTR(clEnqueueAcquireGLObjects);
     MNN_CL_DEFINE_FUNC_PTR(clEnqueueReleaseGLObjects);
-    MNN_CL_DEFINE_FUNC_PTR(clRetainDevice);
-    MNN_CL_DEFINE_FUNC_PTR(clReleaseDevice);
     
     MNN_CL_DEFINE_FUNC_PTR(clCreateCommandQueueWithProperties);
     MNN_CL_DEFINE_FUNC_PTR(clSVMAlloc);
diff --git a/source/backend/opencl/execution/buffer/CastBufExecution.cpp b/source/backend/opencl/execution/buffer/CastBufExecution.cpp
index ee7e51d35..dd4debd80 100644
--- a/source/backend/opencl/execution/buffer/CastBufExecution.cpp
+++ b/source/backend/opencl/execution/buffer/CastBufExecution.cpp
@@ -60,6 +60,7 @@ ErrorCode CastBufExecution::onEncode(const std::vector<Tensor*>& inputs, const s
     openCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalSize);
     unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
     unit.localWorkSize = {mLocalSize[0], mLocalSize[1], mLocalSize[2]};
+
     return NO_ERROR;
 }
 
diff --git a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp
index 8eb739f28..ba25bda93 100644
--- a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp
+++ b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp
@@ -36,7 +36,7 @@ ConvBufCommonExecution::ConvBufCommonExecution(const Convolution2D *conv2dParams
     mResource->mBias.reset(Tensor::createDevice<float>({1, 1, 1, ROUND_UP(biasSize, 32)}));
     backend->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC);
     cl::Buffer &biasBuffer = openCLBuffer(mResource->mBias.get());
-    
+
     cl_int res;
     auto biasPtrCL = openclBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(
         biasBuffer, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res);
@@ -103,7 +103,7 @@ ConvBufExecution::ConvBufExecution(const std::vector<Tensor *> &inputs, const st
     auto padding = ConvolutionCommon::convolutionPad(inputs[0], outputs[0], mResource->mConv2dCommonParams);
     mPaddings[0] = padding.second;//padY
     mPaddings[1] = padding.first;//padX
-        
+
     mResource->mKernelWidth   = conv2dCommonParams->kernelX();
     mResource->mKernelHeight  = conv2dCommonParams->kernelY();
     mResource->mOutputChannel = conv2dCommonParams->outputCount();
@@ -116,7 +116,7 @@ ConvBufExecution::ConvBufExecution(const std::vector<Tensor *> &inputs, const st
         mResource->mRasterExe.reset(new RasterBufExecution({mResource->mFilter.get()}, op, mOpenCLBackend));
     } else {
         int weightSize   = 0;
-        ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2dParams, &mFilterDataPtr, &weightSize);
+        ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &mFilterDataPtr, &weightSize);
         //select opt conv method
         bool isConv1x1 = (mResource->mKernelHeight == mResource->mKernelWidth && mResource->mKernelHeight == 1 && mPaddings[0] == 0 &&
                           mPaddings[1] == 0 && mResource->mStrides[0] == 1 && mResource->mStrides[1] == 1);
@@ -132,7 +132,6 @@ ConvBufExecution::ConvBufExecution(const std::vector<Tensor *> &inputs, const st
         // Tile Match with mConvGemmOptLevel == 2
         int tileK = 4;
         int tileN = 32;
-        
         int buffer_size = ROUND_UP(mResource->mOutputChannel, tileN) * ROUND_UP(mResource->mInputChannel, tileK);
         mResource->mFilter.reset(
             Tensor::createDevice<float>({buffer_size}));
@@ -176,7 +175,7 @@ ConvBufExecution::ConvBufExecution(const std::vector<Tensor *> &inputs, const st
             std::vector<int> filterImageShape{ROUND_UP(mResource->mInputChannel, 4), (UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight)};
             std::shared_ptr<Tensor> filterBuffer(
                 Tensor::createDevice<float>({mResource->mOutputChannel, ROUND_UP(mResource->mInputChannel, 4), mResource->mKernelWidth, mResource->mKernelHeight}));
-            
+
             int buffer_size = filterBuffer->elementSize() * sizeof(float);
             cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size);
             filterBuffer->buffer().device = (uint64_t)(&filterBufferCL);
@@ -199,12 +198,12 @@ ConvBufExecution::ConvBufExecution(const std::vector<Tensor *> &inputs, const st
             mResource->mFilter.reset(Tensor::createDevice<float>({1, filterImageShape[1], 1, 4 * filterImageShape[0]}));
             mOpenCLBackend->onAcquireBuffer(mResource->mFilter.get(), Backend::STATIC);
             MNN::OpenCL::BufferConvertor bufferConvertor{mOpenCLBackend->getOpenCLRuntime()};
-            
+
             bool needTrans = true;
             bufferConvertor.convertToNC4HW4Buffer(filterBuffer.get(), MNN::OpenCL::CONV2D_FILTER, mResource->mFilter.get(), needTrans);
         }
     }
-        
+
     if (mResource->mConv2dCommonParams->relu()) {
         mResource->mBuildOptions.emplace("-DRELU");
     } else if (mResource->mConv2dCommonParams->relu6()) {
@@ -270,17 +269,17 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
     auto padding = ConvolutionCommon::convolutionPad(input, output, mResource->mConv2dCommonParams);
     mPaddings[0] = padding.second;//padY
     mPaddings[1] = padding.first;//padX
-    
+
     // printf("nchw %d %d %d %d, cohw %d %d %d, khw %d %d  gemm:%d \n", inputs[0]->batch(), inputs[0]->channel(), inputs[0]->height(), inputs[0]->width(), outputs[0]->channel(), outputs[0]->height(), outputs[0]->width(), mResource->mKernelWidth, mResource->mKernelHeight, mResource->mConvGemmOptLevel);
-    
+
     std::string info = std::to_string(inputChannels) + "_" + std::to_string(outChannel) + "_" + std::to_string(mResource->mKernelHeight) + "_" + std::to_string(mResource->mKernelWidth) + "_" + std::to_string(mResource->mStrides[0]) + "_" + std::to_string(mResource->mStrides[1]) + "_" + std::to_string(mResource->mDilations[0]) + "_" + std::to_string(mResource->mDilations[1]);
-    
+
     if (mResource->mConvGemmOptLevel > 0) {
         int area = height * width;
         int M = outputShape.at(0) * area;
         int N = outputShape.at(3);
         int K = inputShape.at(3);
-        
+
         bool isAlign = (K % 8 == 0 && area == 1 && N % 64 == 0 && M % 64 == 0);
         bool isLimitSize = (M * 1.0 / 512 * N / 512 * K / 512 <= 1.0) && (1.0 * M * K / N / N >= 16.0);
         if(isAlign && isLimitSize) {
@@ -289,7 +288,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
             mResource->mConvGemmOptLevel = 0;
         }
     }
-    
+
     if (mResource->mConvGemmOptLevel == 2) {
         // set large tile
         int tileM = 16;
@@ -300,27 +299,22 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
         int M = outputShape.at(0) * area;
         int N = outputShape.at(3);
         int K = inputShape.at(3);
-        
+
         int alignM = ROUND_UP(M, tileM);
         int alignN = ROUND_UP(N, tileN);
         int alignK = ROUND_UP(K, tileK);
-        
+
         // ReArrange input
         mConvGemmInpTensor.reset(Tensor::createDevice<float>({alignK * alignM}));
         mOpenCLBackend->onAcquireBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC);
-        if(N != alignN || M != alignM || area != 1) {
-            mNeedOutTempTensor = true;
-            mConvGemmOutTensor.reset(Tensor::createDevice<float>({alignN * alignM}));
-            mOpenCLBackend->onAcquireBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC);
-        }
-        mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC);
-        if(mNeedOutTempTensor) {
-            mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC);
-        }
         
+        mNeedOutTempTensor = true;
+        mConvGemmOutTensor.reset(Tensor::createDevice<float>({alignN * alignM}));
+        mOpenCLBackend->onAcquireBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC);
+
         {
             std::set<std::string> buildOptions;
-            
+
             int m_pack = 1;
             if(area == 1) {
                 m_pack = 4;
@@ -352,89 +346,15 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
             mPreGlobalWorkSize[0] = ROUND_UP(mPreGlobalWorkSize[0], std::max((uint32_t)1, mPreLocalWorkSize[0]));
             mPreGlobalWorkSize[1] = ROUND_UP(mPreGlobalWorkSize[1], std::max((uint32_t)1, mPreLocalWorkSize[1]));
         }
-        std::set<std::string> buildOptions;
-        
-        uint32_t hasBias = 0;
-        if(!mNeedOutTempTensor) {
-            hasBias = 1;
-            buildOptions = mResource->mBuildOptions;
-            buildOptions.emplace("-DBIAS");
-        }
-        uint32_t layout = 4;
-        uint32_t batch = 1;
-        
-        cl::Buffer outBuffer = mNeedOutTempTensor ? openCLBuffer(mConvGemmOutTensor.get()) : openCLBuffer(output);
-        std::vector<uint32_t> param;
-        if(mNeedOutTempTensor) {
-            param = getGemmParams({(uint32_t)alignM, (uint32_t)alignN, (uint32_t)alignK, layout, batch, hasBias}, {openCLBuffer(mConvGemmInpTensor.get()), openCLBuffer(mResource->mFilter.get()), openCLBuffer(mConvGemmOutTensor.get())}, mOpenCLBackend->getOpenCLRuntime());
-        } else {
-            param = getGemmParams({(uint32_t)alignM, (uint32_t)alignN, (uint32_t)alignK, layout, batch, hasBias}, {openCLBuffer(mConvGemmInpTensor.get()), openCLBuffer(mResource->mFilter.get()), openCLBuffer(output), openCLBuffer(mResource->mBias.get())}, mOpenCLBackend->getOpenCLRuntime());
-        }
 
-        int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13];
-        buildOptions.emplace("-DKWG=" + std::to_string(KWG));
-        buildOptions.emplace("-DKWI=" + std::to_string(KWI));
-        buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA));
-        buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC));
-        buildOptions.emplace("-DMWG=" + std::to_string(MWG));
-        buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB));
-        buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC));
-        buildOptions.emplace("-DNWG=" + std::to_string(NWG));
-        buildOptions.emplace("-DSA=" + std::to_string(SA));
-        buildOptions.emplace("-DSB=" + std::to_string(SB));
-        buildOptions.emplace("-DSTRM=" + std::to_string(STRM));
-        buildOptions.emplace("-DSTRN=" + std::to_string(STRN));
-        buildOptions.emplace("-DVWM=" + std::to_string(VWM));
-        buildOptions.emplace("-DVWN=" + std::to_string(VWN));
-        if(layout >= 4) {
-            buildOptions.emplace("-DOUTPUTMN");
-        }
-
-        tileM = MWG;
-        tileN = NWG;
-        int localM = MDIMC;
-        int localN = NDIMC;
-                 
-        if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) {
-            buildOptions.emplace("-DUSE_CL_MAD=1");
-            buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1");
-        }
-
-        mKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_params_buf", "Xgemm", buildOptions);
-        
-        int out_per_thread_m = tileM / localM;
-        int out_per_thread_n = tileN / localN;
-        
-        mGlobalWorkSize = {static_cast<uint32_t>(alignM/out_per_thread_m), static_cast<uint32_t>(alignN/out_per_thread_n)};
-        mLocalWorkSize = {static_cast<uint32_t>(localM), static_cast<uint32_t>(localN)};
-        
-        float alpha = 1.0;
-        float beta = 0.0f;
-        int offset = 0;
-        int idx            = 0;
-        cl_int ret = CL_SUCCESS;
-        ret |= mKernel->get().setArg(idx++, static_cast<int>(alignM));
-        ret |= mKernel->get().setArg(idx++, static_cast<int>(alignN));
-        ret |= mKernel->get().setArg(idx++, static_cast<int>(alignK));
-        ret |= mKernel->get().setArg(idx++, alpha);
-        ret |= mKernel->get().setArg(idx++, beta);
-        ret |= mKernel->get().setArg(idx++, openCLBuffer(mConvGemmInpTensor.get()));
-        ret |= mKernel->get().setArg(idx++, openCLBuffer(mResource->mFilter.get()));
-        if(mNeedOutTempTensor) {
-            ret |= mKernel->get().setArg(idx++, openCLBuffer(mConvGemmOutTensor.get()));
-        } else {
-            ret |= mKernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get()));
-            ret |= mKernel->get().setArg(idx++, openCLBuffer(output));
+        // call gemm strassen
+        {
+            mStrassenComputor.reset(new StrassenMatrixComputor(backend(), 3));
+            mStrassenComputor->onEncode(alignM, alignK, alignN, alignM, alignN, alignN, openCLBuffer(mConvGemmInpTensor.get()), openCLBuffer(mResource->mFilter.get()), openCLBuffer(mConvGemmOutTensor.get()),
+                                         false, openCLBuffer(mResource->mBias.get()));
         }
-        ret |= mKernel->get().setArg(idx++, offset);
-        ret |= mKernel->get().setArg(idx++, offset);
-        ret |= mKernel->get().setArg(idx++, offset);
-        
-        MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf mConvgemmOptLevel==2 Kernel Select");
-        mOpenCLBackend->recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize);
-        mGlobalWorkSize[0] = ROUND_UP(mGlobalWorkSize[0], std::max((uint32_t)1, mLocalWorkSize[0]));
-        mGlobalWorkSize[1] = ROUND_UP(mGlobalWorkSize[1], std::max((uint32_t)1, mLocalWorkSize[1]));
         
+        // call output transpose
         if(mNeedOutTempTensor) {
             std::set<std::string> buildOptions = mResource->mBuildOptions;
             if(area == 1) {
@@ -464,9 +384,15 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
             mOpenCLBackend->recordKernel2d(mPostKernel, mPostGlobalWorkSize, mPostLocalWorkSize);
             mPostGlobalWorkSize[0] = ROUND_UP(mPostGlobalWorkSize[0], std::max((uint32_t)1, mPostLocalWorkSize[0]));
             mPostGlobalWorkSize[1] = ROUND_UP(mPostGlobalWorkSize[1], std::max((uint32_t)1, mPostLocalWorkSize[1]));
-            
+
             mOpenCLBackend->endRecord(mRecording);
         }
+        
+        mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC);
+        if(mNeedOutTempTensor) {
+            mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC);
+        }
+        
         return NO_ERROR;
     } else if (mResource->mConvGemmOptLevel == 1) {
         // set small tile
@@ -489,11 +415,11 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
             buildOptions.emplace(" -DOPWM=64 -DOPWN=64 -DCPWK=8 -DOPTM=4 -DOPTN=4");
         }
 
-        
+
         mKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_local_buf", "matmul_local_buf", buildOptions);
         int out_per_thread_m = tileM / localM;
         int out_per_thread_n = tileN / localN;
-        
+
         mGlobalWorkSize = {static_cast<uint32_t>(M/out_per_thread_m), static_cast<uint32_t>(N/out_per_thread_n)};
         mLocalWorkSize = {static_cast<uint32_t>(localM), static_cast<uint32_t>(localN)};
 
@@ -509,14 +435,14 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
 
         MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf mConvgemmOptLevel==1 Kernel Select");
     } else if (mResource->mConv1x1Opt) {
-    
+
         int tileN = 32;
         // {"conv_2d_1x1_c4h1w4", "conv_2d_1x1_c4h1w2", "conv_2d_1x1_c4h1w1", "conv_2d_1x1_c8h1w4"};
         const int total_kernel = 3;
         std::string kernelName[total_kernel] = {"conv_2d_1x1_c4h1w4", "conv_2d_1x1_c4h1w2", "conv_2d_1x1_c4h1w1"};
         int itemC[total_kernel] = {4, 4, 4};
         int itemW[total_kernel] = {4, 2, 1};
-        
+
         int actual_kernel = total_kernel;
         if(mResource->mConv1x1C8Opt) {
             actual_kernel = 2;
@@ -528,7 +454,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
             itemC[1]      = 8;
             itemW[1]      = 2;
         }
-        
+
         std::shared_ptr<KernelWrap> kernel[total_kernel];
         std::vector<uint32_t> globalWorkSize[total_kernel];
         std::vector<uint32_t> localWorkSize[total_kernel];
@@ -543,11 +469,11 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
             }
             kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[knl_idx], buildOption);
             uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));
-            
+
             uint32_t idx            = 0;
             cl_int ret = CL_SUCCESS;
             globalWorkSize[knl_idx] = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), itemC[knl_idx]) * UP_DIV(outputShape.at(2), itemW[knl_idx])), static_cast<uint32_t>(outputShape.at(0) * outputShape.at(1))};
-            
+
             ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][0]);
             ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][1]);
             ret |= kernel[knl_idx]->get().setArg(idx++, UP_DIV(width, itemW[knl_idx]));
@@ -575,7 +501,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
         std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
         int min_index  = min_cost.second;
         mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};
-        
+
         std::set<std::string> buildOption = mResource->mBuildOptions;
         if(outputShape.at(3) % itemC[min_index] != 0){
             buildOption.emplace("-DCHANNEL_LEAVE");
@@ -609,18 +535,18 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
         int strideShape[2]      = {mResource->mStrides[0],mResource->mStrides[1]};
         int paddingShape[2]     = {mPaddings[0], mPaddings[1]};
         int dilationShape[2]    = {mResource->mDilations[0], mResource->mDilations[1]};
-        
+
         // {"conv_2d_c4h1w2", "conv_2d_c4h1w1", "conv_2d_c8h1w1", "conv_2d_c4h1w4", "conv_2d_c8h2w1", "conv_2d_c4h4w1"};
         const int total_kernel = 7;
         std::string kernelName[total_kernel] = {"conv_2d_c4h1w1", "conv_2d_c4h1w2", "conv_2d_c4h4w1", "conv_2d_c8h2w1", "conv_2d_c8h4w1", "conv_2d_c4h1w4", "conv_2d_c8h1w4"};
         int itemC[total_kernel] = {4, 4, 4, 8, 8, 4, 8};
         int itemH[total_kernel] = {1, 1, 4, 2, 4, 1, 1};
         int itemW[total_kernel] = {1, 2, 1, 1, 1, 4, 4};
-        
-        
+
+
         int actual_kernel = total_kernel;
-        
-        
+
+
         std::shared_ptr<KernelWrap> kernel[total_kernel];
         std::vector<uint32_t> globalWorkSize[total_kernel];
         std::vector<uint32_t> localWorkSize[total_kernel];
@@ -635,7 +561,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
             }
             kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[knl_idx], buildOption);
             uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));
-            
+
             globalWorkSize[knl_idx] = {static_cast<uint32_t>(UP_DIV(outputShape.at(3), itemC[knl_idx]) * UP_DIV(outputShape.at(2), itemW[knl_idx])), static_cast<uint32_t>(outputShape.at(0) * UP_DIV(outputShape.at(1), itemH[knl_idx]))};
             uint32_t idx            = 0;
             cl_int ret = CL_SUCCESS;
@@ -678,7 +604,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector<Tensor *> &inputs, const
             buildOption.emplace("-DBLOCK_LEAVE");
         }
         mKernel        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[min_index], buildOption);
-        
+
         uint32_t idx            = 0;
         cl_int ret = CL_SUCCESS;
 
@@ -736,31 +662,36 @@ ErrorCode ConvBufExecution::onExecute(const std::vector<Tensor *> &inputs, const
         runKernel2D(mPreKernel, mPreGlobalWorkSize, mPreLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event0);
         mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvBuf2D-gemm2-0", event0});
     }
-    cl::Event event;
-    runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event);
-    std::string name = "ConvBuf2D";
-    std::string b = std::to_string(inputs[0]->batch());
-    std::string ci = std::to_string(inputs[0]->channel());
-    std::string hi = std::to_string(inputs[0]->height());
-    std::string wi = std::to_string(inputs[0]->width());
-    std::string co = std::to_string(outputs[0]->channel());
-    std::string ho = std::to_string(outputs[0]->height());
-    std::string wo = std::to_string(outputs[0]->width());
-    std::string kh = std::to_string(mResource->mKernelHeight);
-    std::string kw = std::to_string(mResource->mKernelWidth);
-    std::string total = std::to_string(1.0 / 1000000 * inputs[0]->batch() * inputs[0]->channel() * outputs[0]->channel() * outputs[0]->height() * outputs[0]->width() * mResource->mKernelHeight * mResource->mKernelWidth);
-    if (mResource->mConvGemmOptLevel > 0) {
-        std::string m = std::to_string(outputs[0]->width() * outputs[0]->height() * inputs[0]->batch());
-        name += "-gemm";
-        name += std::to_string(mResource->mConvGemmOptLevel) + "-m" + m + "n" + co + "k" + ci;
-    } else if (mResource->mConv1x1Opt) {
-        name += "-conv1x1";
-        name += "-b" + b + "ci" + ci + "hi" + hi + "wi" + wi + "co" + co;
+
+    if(mResource->mConvGemmOptLevel == 2) {
+        mStrassenComputor->onExecute();
     } else {
-        name += "-ori-b" + b + "ci" + ci + "hi" + hi + "wi" + wi + "co" + co+ "ho" + ho + "wo" + wo + "kh" + kh + "kw" + kw;
+        cl::Event event;
+        runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event);
+        std::string name = "ConvBuf2D";
+        std::string b = std::to_string(inputs[0]->batch());
+        std::string ci = std::to_string(inputs[0]->channel());
+        std::string hi = std::to_string(inputs[0]->height());
+        std::string wi = std::to_string(inputs[0]->width());
+        std::string co = std::to_string(outputs[0]->channel());
+        std::string ho = std::to_string(outputs[0]->height());
+        std::string wo = std::to_string(outputs[0]->width());
+        std::string kh = std::to_string(mResource->mKernelHeight);
+        std::string kw = std::to_string(mResource->mKernelWidth);
+        std::string total = std::to_string(1.0 / 1000000 * inputs[0]->batch() * inputs[0]->channel() * outputs[0]->channel() * outputs[0]->height() * outputs[0]->width() * mResource->mKernelHeight * mResource->mKernelWidth);
+        if (mResource->mConvGemmOptLevel > 0) {
+            std::string m = std::to_string(outputs[0]->width() * outputs[0]->height() * inputs[0]->batch());
+            name += "-gemm";
+            name += std::to_string(mResource->mConvGemmOptLevel) + "-m" + m + "n" + co + "k" + ci;
+        } else if (mResource->mConv1x1Opt) {
+            name += "-conv1x1";
+            name += "-b" + b + "ci" + ci + "hi" + hi + "wi" + wi + "co" + co;
+        } else {
+            name += "-ori-b" + b + "ci" + ci + "hi" + hi + "wi" + wi + "co" + co+ "ho" + ho + "wo" + wo + "kh" + kh + "kw" + kw;
+        }
+        name += "-total:" + total + "*10^6";
+        mOpenCLBackend->getOpenCLRuntime()->pushEvent({name.c_str(), event});
     }
-    name += "-total:" + total + "*10^6";
-    mOpenCLBackend->getOpenCLRuntime()->pushEvent({name.c_str(), event});
     if (mPostKernel) {
         cl::Event event2;
         runKernel2D(mPostKernel, mPostGlobalWorkSize, mPostLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event2);
@@ -777,12 +708,17 @@ ErrorCode ConvBufExecution::onExecute(const std::vector<Tensor *> &inputs, const
     if (mPreKernel) {
         runKernel2D(mPreKernel, mPreGlobalWorkSize, mPreLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
     }
-    runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
+    
+    if(mResource->mConvGemmOptLevel == 2) {
+        mStrassenComputor->onExecute();
+    } else {
+        runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
+    }
     if (mPostKernel) {
         runKernel2D(mPostKernel, mPostGlobalWorkSize, mPostLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
     }
 #endif
-    
+
 #ifdef LOG_VERBOSE
     MNN_PRINT("end ConvExecution onExecute !\n");
 #endif
@@ -819,7 +755,7 @@ class ConvolutionBufCreator : public OpenCLBackend::Creator {
                     return nullptr;
                 }
             }
-        }  
+        }
 #endif
         if (nullptr != op->main_as_Convolution2D()->quanParameter()) {
             auto quan = op->main_as_Convolution2D()->quanParameter();
@@ -830,12 +766,12 @@ class ConvolutionBufCreator : public OpenCLBackend::Creator {
                 }
             }
         }
-        
+
         if(op->main_as_Convolution2D()->common()->group() > 1){
             // Don't support group > 1 now
             return nullptr;
         }
-        
+
         if (inputs.size() > 1) {
             // Multi inputs
             for (int i = 0; i < inputs.size(); ++i) {
diff --git a/source/backend/opencl/execution/buffer/ConvBufExecution.hpp b/source/backend/opencl/execution/buffer/ConvBufExecution.hpp
index e5abe2a53..96e1ec5aa 100644
--- a/source/backend/opencl/execution/buffer/ConvBufExecution.hpp
+++ b/source/backend/opencl/execution/buffer/ConvBufExecution.hpp
@@ -12,6 +12,8 @@
 #define ConvBufExecution_hpp
 
 #include "backend/opencl/execution/image/CommonExecution.hpp"
+#include "backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.hpp"
+
 namespace MNN {
 namespace OpenCL {
 
@@ -82,6 +84,10 @@ class ConvBufExecution : public ConvBufCommonExecution, public CommonExecution {
     std::vector<uint32_t> mPostGlobalWorkSize{1, 1, 1};
     std::vector<uint32_t> mPostLocalWorkSize{1, 1, 1, 1};
     const float* mFilterDataPtr = nullptr;
+private:
+
+    std::shared_ptr<StrassenMatrixComputor> mStrassenComputor;
+
 };
 
 } // namespace OpenCL
diff --git a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp
index 36a4afd24..c932c0a6c 100644
--- a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp
+++ b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp
@@ -9,10 +9,9 @@
 // #define LOG_VERBOSE
 namespace MNN {
 namespace OpenCL {
-
 // set mDequantScale mDequantOffset mNumQuantBit mFilterDataPtr from mConv2dParams
 void ConvBufLowMemoryExecution::getInfoFromOpLowMemory(std::shared_ptr<ConvolutionCommon::Int8Common> & quanCommon) {
-    quanCommon = ConvolutionCommon::load(mResource->mConv2dParams, this->backend(), false, true);
+    quanCommon = ConvolutionCommon::load(mOp, this->backend(), false, true);
     if (mResource->mConv2dParams->quanParameter() != nullptr) {
         mLowMemoryFlag = true;
     } else {
@@ -23,6 +22,7 @@ void ConvBufLowMemoryExecution::getInfoFromOpLowMemory(std::shared_ptr<Convoluti
     // set mResource->mNumQuantBit
     if(quanCommon->canUseInt4){
         mResource->mNumQuantBit = 4;
+        mResource->mInputChannel = (quanCommon->weight.size() * 2) / (mResource->mKernelWidth * mResource->mKernelHeight * mResource->mOutputChannel);
     }else{
         mResource->mNumQuantBit = 8;
     }
@@ -93,9 +93,9 @@ bool ConvBufLowMemoryExecution::convertToQuantWeight1x1Buffer(cl::Buffer input,
         buildOptions.emplace("-DUSE_LOW_BIT_WEIGHT_INT4");
     } else {/* More types to be supported. */}
     if(mResource->mInputChannel % pack != 0){
-        buildOptions.emplace("-DINPUT_CHANNEL_LEAVE");
+        buildOptions.emplace("-DCHANNEL_LEAVE");
     }
-    
+
     mBufferToConv1x1Kernel = runtime->buildKernelWithCache("buffer_convert_quant", kernelName, buildOptions);
     auto kernel = mBufferToConv1x1Kernel->get();
     uint32_t gws[2] = {static_cast<uint32_t>(UP_DIV(mResource->mInputChannel, pack)), static_cast<uint32_t>(mResource->mOutputChannel)};
@@ -128,7 +128,7 @@ bool ConvBufLowMemoryExecution::convertToQuantWeight1x1Buffer(cl::Buffer input,
     res = runtime->commandQueue().enqueueNDRangeKernel(kernel, cl::NullRange,
                                                          cl::NDRange(roundUpGroupWorkSize[0], roundUpGroupWorkSize[1]),
                                                          cl::NDRange(lws[0], lws[1]), nullptr, &event);
-    
+
     event.wait();
     MNN_CHECK_CL_SUCCESS(res, "convertToQuantWeight1x1Buffer");
 
@@ -141,9 +141,15 @@ bool ConvBufLowMemoryExecution::convertToQuantWeight1x1Buffer(cl::Buffer input,
 // set mKernelBuffer for the 1x1 kernels
 void ConvBufLowMemoryExecution::set1x1WeightLowMemory(int packCout, int packCin, void * filterDataPtr, std::shared_ptr<ConvolutionCommon::Int8Common> & quanCommon) {
     cl_int res;
-    std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>({ROUND_UP(mResource->mOutputChannel, packCout), ROUND_UP(mResource->mInputChannel, packCin), mResource->mKernelWidth, mResource->mKernelHeight}));
+    std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>({ROUND_UP(mResource->mOutputChannel, packCout), ROUND_UP(mResource->mInputChannel, packCin), 1, 1}));
     size_t buffer_size = filterBuffer->usize() / sizeof(float);
-    size_t cpy_size = mResource->mOutputChannel * mResource->mInputChannel * mResource->mKernelWidth * mResource->mKernelHeight * sizeof(char);
+    size_t cpy_size = mResource->mOutputChannel * mResource->mInputChannel;
+    // shared part for all cases
+    if (mResource->mNumQuantBit == 4){
+        // int4 case
+        buffer_size /= 2;
+        cpy_size = UP_DIV(cpy_size, 2);
+    } else {/* More types to be supported. */}
     float *dequantAlpha = quanCommon->alpha.get();
     cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size);
     void *mapPtr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(filterBufferCL, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res);
@@ -154,26 +160,19 @@ void ConvBufLowMemoryExecution::set1x1WeightLowMemory(int packCout, int packCin,
         MNN_ASSERT(false);
     }
     mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(filterBufferCL, mapPtr);
-    // shared part for all cases
-    if (mResource->mNumQuantBit == 8) {
-        // int8 case
-        buffer_size *= sizeof(int8_t);
-    } else if (mResource->mNumQuantBit == 4){
-        // int4 case
-        buffer_size /= 2;
-    } else {/* More types to be supported. */}
-    
+
     // Use Image load weights
     if(UP_DIV(mResource->mInputChannel, packCin) <= 16384 && ROUND_UP(mResource->mOutputChannel, packCout) <= 16384){
         mResource->mUseImage = true;
     }
     if(mResource->mUseImage) {
-        if(mResource->mNumQuantBit == 4){
-            packCin *= 2;
-        }
         size_t w = ROUND_UP(mResource->mOutputChannel, packCout);
         size_t h = UP_DIV(mResource->mInputChannel, packCin);
-        mResource->mKernelImage.reset(new cl::Image2D(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE, cl::ImageFormat(CL_RGBA, CL_SIGNED_INT32), w, h, 0, nullptr, &res));
+        if(mResource->mNumQuantBit == 4){
+            mResource->mKernelImage.reset(new cl::Image2D(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT16), w, h, 0, nullptr, &res));
+        }else{
+            mResource->mKernelImage.reset(new cl::Image2D(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE, cl::ImageFormat(CL_RGBA, CL_SIGNED_INT32), w, h, 0, nullptr, &res));
+        }
         if (nullptr == mResource->mKernelImage.get() || res != CL_SUCCESS) {
             MNN_ERROR("Alloc Image %d x %d error, code:%d \n", (int)w, (int)h, (int)res);
         }
@@ -185,11 +184,13 @@ void ConvBufLowMemoryExecution::set1x1WeightLowMemory(int packCout, int packCin,
 // set mFilter for the general kernels
 void ConvBufLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, std::shared_ptr<ConvolutionCommon::Int8Common> & quanCommon) {
     if (filterDataPtr != nullptr) {
-        std::vector<int> filterImageShape{ROUND_UP(mResource->mInputChannel, 4), (UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight)};
-        std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>({mResource->mOutputChannel, ROUND_UP(mResource->mInputChannel, 4), mResource->mKernelWidth, mResource->mKernelHeight}));
-        // int buffer_size = filterBuffer->elementSize();
+        std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>({ROUND_UP(mResource->mOutputChannel, 4), mResource->mInputChannel, mResource->mKernelWidth, mResource->mKernelHeight}));
         size_t buffer_size = filterBuffer->usize() / sizeof(float);
-        buffer_size *= sizeof(int8_t);
+        size_t cpy_size = mResource->mOutputChannel * mResource->mInputChannel * mResource->mKernelWidth * mResource->mKernelHeight;
+        if (mResource->mNumQuantBit == 4){
+            buffer_size /= 2;
+            cpy_size = UP_DIV(cpy_size, 2);
+        }
         cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size);
         filterBuffer->buffer().device = (uint64_t)(&filterBufferCL);
         float *dequantAlpha = quanCommon->alpha.get();
@@ -197,14 +198,7 @@ void ConvBufLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, s
         cl_int res;
         auto ptrCL = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(filterBufferCL, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res);
         if(ptrCL != nullptr && res == CL_SUCCESS) {
-            ::memset(ptrCL, 0, buffer_size);
-            const int copy_size = mResource->mKernelWidth * mResource->mKernelHeight * sizeof(int8_t);
-            for(int oc=0; oc<mResource->mOutputChannel; oc++) {
-                int ic = 0;
-                for(; ic<mResource->mInputChannel; ic++) {
-                    ::memcpy((int8_t *)ptrCL + (oc * ROUND_UP(mResource->mInputChannel, 4) + ic) * mResource->mKernelWidth * mResource->mKernelHeight, ((int8_t *)filterDataPtr) + (oc * mResource->mInputChannel + ic) * mResource->mKernelWidth * mResource->mKernelHeight, copy_size);
-                }
-            }
+                ::memcpy(ptrCL, filterDataPtr, cpy_size);
         } else {
             MNN_ERROR("setGeneralWeightLowMemory: Map error ptrCL == nullptr \n");
         }
@@ -212,7 +206,7 @@ void ConvBufLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, s
         // convert to NC4HW4
         if (mResource->mNumQuantBit == 8) {
             // ROUND_UP(IC, 4), UP_DIV(OC, 4) * mKernelWidth * mKernelHeight
-            mResource->mFilter.reset(Tensor::createDevice<int8_t>({1, filterImageShape[1], 1, 4 * filterImageShape[0]}));
+            mResource->mFilter.reset(Tensor::createDevice<int8_t>({1, UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight, 1, 4 * ROUND_UP(mResource->mInputChannel, 4)}));
             mOpenCLBackend->onAcquireBuffer(mResource->mFilter.get(), Backend::STATIC);
             MNN::OpenCL::BufferConvertor bufferConvertor{mOpenCLBackend->getOpenCLRuntime()};
             // filterBuffer shape: {OC, ROUND_UP(IC, 4), mKernelWidth, mKernelHeight}
@@ -222,7 +216,7 @@ void ConvBufLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, s
             // For int4 case, data stored in mFilter should be uint8_t,
             // while "Tensor::createDevice<uint8_t>" occupies more memory than "Tensor::createDevice<int8_t>".
             // Therefore, we use "Tensor::createDevice<int8_t>" currently, leaving "Tensor::createDevice<uint8_t>" to be supported.
-            mResource->mFilter.reset(Tensor::createDevice<int8_t>({1, filterImageShape[1], 1, 2 * filterImageShape[0]}));
+            mResource->mFilter.reset(Tensor::createDevice<int8_t>({1, UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight, 1, 2 * ROUND_UP(mResource->mInputChannel, 4)}));
             mOpenCLBackend->onAcquireBuffer(mResource->mFilter.get(), Backend::STATIC);
             MNN::OpenCL::BufferConvertor bufferConvertor{mOpenCLBackend->getOpenCLRuntime()};
             // filterBuffer shape: {OC, ROUND_UP(IC, 4), mKernelWidth, mKernelHeight}
@@ -352,8 +346,9 @@ void ConvBufLowMemoryExecution::tuneGeneralCaseLowMemory(Tensor * input, Tensor
     unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]};
     return;
 }
-void ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * output) {
+unsigned int ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * output) {
     auto &unit = mUnits[0];
+    unsigned int total_time = 0;
     std::vector<int> inputShape  = tensorShapeFormat(input);
     std::vector<int> outputShape = tensorShapeFormat(output);
     const int outChannel = outputShape.at(3);
@@ -379,7 +374,7 @@ void ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * outpu
     if(width == 1 && height == 1){
         buildOption.emplace("-DWIDTH_HEIGHT_1");
     }
-    
+
     if(blockDim % 16 != 0){
         buildOption.emplace("-DINPUT_CHANNEL_LEAVE");
     } else if (mResource->mUseImage && mResource->mNumQuantBit == 4 && blockDim % 32 != 0) {
@@ -401,7 +396,7 @@ void ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * outpu
     for (; knl_idx < actual_kernel; knl_idx++) {
         kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemv_conv1x1_buf", kernelName[knl_idx], buildOption);
         uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));
-        
+
         globalWorkSize[knl_idx] = {static_cast<uint32_t>(UP_DIV(outChannel, itemC[knl_idx]) * width), static_cast<uint32_t>(global_y)};
         uint32_t idx            = 0;
         cl_int ret = CL_SUCCESS;
@@ -433,10 +428,11 @@ void ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * outpu
             mLocalWorkSize = {retTune.first[0], retTune.first[1]};
         }
     }
+    total_time += min_cost.first;
     int min_index  = min_cost.second;
     mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};
-    
-    
+
+
     unit.kernel        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemv_conv1x1_buf", kernelName[min_index], buildOption);
     //MNN_PRINT("Kernel is %d.\n", min_index);
     uint32_t idx = 0;
@@ -464,10 +460,11 @@ void ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * outpu
     mOpenCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize);
     unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]};
     unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]};
-    return;
+    return total_time;
 }
-void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor * output) {
+unsigned int ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor * output) {
     mUnits.resize(3);
+    unsigned int total_time = 0;
     std::vector<int> inputShape  = tensorShapeFormat(input);
     std::vector<int> outputShape = tensorShapeFormat(output);
     const int outChannel = outputShape.at(3);
@@ -478,16 +475,16 @@ void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor *
     const int outputChannelBlocks = UP_DIV(outChannel, 4);
     const int blockNum = mResource->mBlockSize;
     const int blockDim = mResource->mInputChannel / mResource->mBlockSize;
-    
+
     int global_y = UP_DIV(batch, 4) * width_height;
-    const int total_kernel = 5;
-    std::string kernelName[total_kernel] = {"gemm_b4_c1_buf", "gemm_b4_c2_buf", "gemm_b4_c4_buf", "gemm_b4_c1_image",  "gemm_b4_c2_image"};
-    int itemC[total_kernel] = {1, 2, 4, 1, 2};
+    const int total_kernel = 6;
+    std::string kernelName[total_kernel] = {"gemm_b4_c1_buf", "gemm_b4_c2_buf", "gemm_b4_c4_buf", "gemm_b4_c1_image",  "gemm_b4_c2_image", "gemm_b4_c4_image"};
+    int itemC[total_kernel] = {1, 2, 4, 1, 2, 4};
     int actual_kernel = total_kernel;
     std::shared_ptr<KernelWrap> kernel[total_kernel];
     std::vector<uint32_t> globalWorkSize[total_kernel];
     std::vector<uint32_t> localWorkSize[total_kernel];
-    std::pair<int, int> min_cost(INT_MAX, 0);//(min_time, min_index)
+    std::pair<unsigned int, int> min_cost(INT_MAX, 0);//(min_time, min_index)
     std::set<std::string> buildOption = mResource->mBuildOptions;
     if(blockDim % 16 != 0){
         buildOption.emplace("-DINPUT_CHANNEL_LEAVE");
@@ -510,7 +507,7 @@ void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor *
         mGlobalWorkSize = {static_cast<uint32_t>(UP_DIV(mResource->mInputChannel, 4)), static_cast<uint32_t>(UP_DIV(batch, 4)), static_cast<uint32_t>(width_height)};
         unit.kernel        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_quant_batch_buf", "reshape_nchw4_nhwc4", buildOption);
         uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel));
-        
+
         uint32_t idx = 0;
         cl_int ret = CL_SUCCESS;
         ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]);
@@ -523,7 +520,9 @@ void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor *
         ret |= unit.kernel->get().setArg(idx++, static_cast<int>(inputChannels));
         ret |= unit.kernel->get().setArg(idx++, static_cast<int>(inputChannelBlocks));
         MNN_CHECK_CL_SUCCESS(ret, "setArg reshape_nc4_cn4");
-        mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "reshape_nchw4_nhwc4", unit.kernel).first;
+        std::pair<std::vector<uint32_t>, unsigned int> retTune = localWS3DDefault(mGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "reshape_nchw4_nhwc4", unit.kernel);
+        total_time += retTune.second;
+        mLocalWorkSize = retTune.first;
         mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize);
         unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
         unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
@@ -540,7 +539,7 @@ void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor *
         for (; knl_idx < actual_kernel; knl_idx++) {
             kernel[knl_idx]        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_quant_batch_buf", kernelName[knl_idx], buildOption);
             uint32_t maxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx]));
-            
+
             globalWorkSize[knl_idx] = {static_cast<uint32_t>(UP_DIV(outChannel, itemC[knl_idx])), static_cast<uint32_t>(global_y)};
             uint32_t idx            = 0;
             cl_int ret = CL_SUCCESS;
@@ -560,18 +559,19 @@ void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor *
             ret |= kernel[knl_idx]->get().setArg(idx++, static_cast<int>(blockNum));
             ret |= kernel[knl_idx]->get().setArg(idx++, static_cast<int>(blockDim));
             MNN_CHECK_CL_SUCCESS(ret, "setArg gemv_conv1x1_buf Kernel Select");
-            std::pair<std::vector<uint32_t>, int> retTune;
+            std::pair<std::vector<uint32_t>, unsigned int> retTune;
             retTune = localWS2DDefault(globalWorkSize[knl_idx], maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName[knl_idx] + info, kernel[knl_idx]);
-	    if(min_cost.first > retTune.second) {
+            if(min_cost.first > retTune.second) {
                 min_cost.first = retTune.second;
                 min_cost.second = knl_idx;
                 mLocalWorkSize = {retTune.first[0], retTune.first[1]};
             }
         }
+        total_time += min_cost.first;
         int min_index  = min_cost.second;
         mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]};
-        
-        
+
+
         unit.kernel        = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_quant_batch_buf", kernelName[min_index], buildOption);
         //MNN_PRINT("Kernel is %d.\n", min_index);
         uint32_t idx = 0;
@@ -613,12 +613,14 @@ void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor *
         ret |= unit.kernel->get().setArg(idx++, static_cast<int>(batch));
         ret |= unit.kernel->get().setArg(idx++, static_cast<int>(outputChannelBlocks));
         MNN_CHECK_CL_SUCCESS(ret, "setArg reshape_cn4_nc4");
-        mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "reshape_nhwc4_nchw4", unit.kernel).first;
+        std::pair<std::vector<uint32_t>, unsigned int> retTune = localWS3DDefault(mGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "reshape_nhwc4_nchw4", unit.kernel);
+        mLocalWorkSize = retTune.first;
+        total_time += retTune.second;
         mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize);
         unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
         unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
     }
-    return;
+    return total_time;
 }
 ConvBufLowMemoryExecution::ConvBufLowMemoryExecution(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, const MNN::Op *op, Backend *backend)
     : ConvBufCommonExecution(op->main_as_Convolution2D(), backend), CommonExecution(backend, op) {
@@ -706,9 +708,17 @@ ErrorCode ConvBufLowMemoryExecution::onEncode(const std::vector<Tensor *> &input
     // onclone default use conv1x1Opt, need reset
     std::vector<int> outputShape = tensorShapeFormat(output);
     const int batch = outputShape.at(0);
-    bool isMali = mOpenCLBackend->getOpenCLRuntime()->getGpuType() == MALI;
+    auto runTime = mOpenCLBackend->getOpenCLRuntime();
     if (mResource->mConv1x1Opt) {
-        if(batch > 1 && isMali){
+        if(batch > 1 && false == getPreParamInfo("ConvBufLowMemoryPreArrangeMode", &batchConvMode, runTime)){
+            if(tuneGemvBatchLowMemory(input, output) < tuneGemmLowMemory(input, output)){
+                batchConvMode = 1;
+            } else{
+                batchConvMode = 2;
+            }
+            setPreParamInfo("ConvBufLowMemoryPreArrangeMode", batchConvMode, runTime);
+        }
+        if(batch > 1 && batchConvMode == 1){
             tuneGemvBatchLowMemory(input, output);
         }else{
             tuneGemmLowMemory(input, output);
diff --git a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.hpp b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.hpp
index de0938c7b..8488f461b 100644
--- a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.hpp
+++ b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.hpp
@@ -28,8 +28,8 @@ class ConvBufLowMemoryExecution : public ConvBufCommonExecution, public CommonEx
     void set1x1WeightLowMemory(int packCout, int packCin, void * filterDataPtr, std::shared_ptr<ConvolutionCommon::Int8Common> & quanCommon);
     void setGeneralWeightLowMemory(void * filterDataPtr, std::shared_ptr<ConvolutionCommon::Int8Common> & quanCommon);
     void tuneGeneralCaseLowMemory(Tensor * input, Tensor * output);
-    void tuneGemmLowMemory(Tensor * input, Tensor * output);
-    void tuneGemvBatchLowMemory(Tensor * input, Tensor * output);
+    unsigned int tuneGemmLowMemory(Tensor * input, Tensor * output);
+    unsigned int tuneGemvBatchLowMemory(Tensor * input, Tensor * output);
     bool convertToQuantWeight1x1Buffer(cl::Buffer input, int pack);
     std::vector<int> mPaddings{0, 0};
     std::vector<uint32_t> mGlobalWorkSize{1, 1, 1};
@@ -39,6 +39,7 @@ class ConvBufLowMemoryExecution : public ConvBufCommonExecution, public CommonEx
     std::shared_ptr<Tensor> mConvGemmInpTensor;
     std::shared_ptr<Tensor> mConvGemmOutTensor;
     std::shared_ptr<KernelWrap> mBufferToConv1x1Kernel = nullptr;
+    uint32_t batchConvMode = 0; // batch > 1 convolution input arrage mode. 0 is need tune; 1 arrage to n/4chw4; 2 arrage to c/4hwn4
 };
 
 } // namespace OpenCL
diff --git a/source/backend/opencl/execution/buffer/ConvBufWinograd.cpp b/source/backend/opencl/execution/buffer/ConvBufWinograd.cpp
index 17ca12a3e..c7b7fc644 100644
--- a/source/backend/opencl/execution/buffer/ConvBufWinograd.cpp
+++ b/source/backend/opencl/execution/buffer/ConvBufWinograd.cpp
@@ -85,7 +85,7 @@ ConvBufWinograd::ConvBufWinograd(const MNN::Op* op, Backend* backend) : CommonEx
     int weightSize             = 0;
     const float* filterDataPtr = nullptr;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2D, &filterDataPtr, &weightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &weightSize);
 
     mCo     = mResource->mCommon->outputCount();
     mCi     = weightSize / mCo / mResource->mCommon->kernelX() / mResource->mCommon->kernelY();
diff --git a/source/backend/opencl/execution/buffer/ConvSubgroupBufExecution.cpp b/source/backend/opencl/execution/buffer/ConvSubgroupBufExecution.cpp
index 142c49268..70685e7bb 100644
--- a/source/backend/opencl/execution/buffer/ConvSubgroupBufExecution.cpp
+++ b/source/backend/opencl/execution/buffer/ConvSubgroupBufExecution.cpp
@@ -110,7 +110,7 @@ ConvSubgroupBuf::ConvSubgroupBuf(const std::vector<Tensor *> &inputs, const std:
         const float *FilterDataPtr = NULL;
         int weightSize = 0;
         std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-        ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2dParams, &FilterDataPtr, &weightSize);
+        ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &FilterDataPtr, &weightSize);
         if (FilterDataPtr != nullptr) {
             std::shared_ptr<Tensor> sourceWeight(
                 Tensor::create<float>(std::vector<int>{mResource->mOutputChannel, mResource->mInputChannel, mResource->mKernelWidth, mResource->mKernelHeight},
@@ -149,7 +149,7 @@ ConvSubgroupBuf::ConvSubgroupBuf(const std::vector<Tensor *> &inputs, const std:
 
             queue.enqueueUnmapMemObject(weightBuffer, weight_ptr);
         }
-    } 
+    }
     {
         int biasSize    = conv2dParams->common()->outputCount();
         int buffer_size = ROUND_UP(biasSize, 16); // pack to 16
@@ -265,7 +265,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector<Tensor *> &inputs, const s
     const int inputHeight   = inputShape.at(1);
     const int inputWidth    = inputShape.at(2);
     const int inputChannels = inputShape.at(3);
-    
+
     int input_width_pad = mResource->mStrides[1] * (8 - 1) + (mResource->mKernelWidth - 1) * mResource->mDilations[1] + 1 + width * mResource->mStrides[1] + mPaddings[1];
     int input_height_pad = inputHeight + 2 * mPaddings[0];
     uint32_t MaxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->MaxWorkGroupSize());
@@ -285,7 +285,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector<Tensor *> &inputs, const s
     uint32_t sub_group_size = 16;
     uint32_t slm_div_factor = tune_param.second;
     uint32_t work_group_size = sub_group_size * slm_div_factor;
-    uint32_t feature_block_size = 16;        
+    uint32_t feature_block_size = 16;
     uint32_t input_line_size = strideShape[1] * (blockWidth - 1) + (kernelShape[1] - 1) * dilationShape[1] + 1;
     uint32_t input_block_size = UP_DIV(input_line_size * kernelShape[0] * dilationShape[0], sub_group_size);
     uint32_t x_blocks = UP_DIV(outputImageShape[1], blockWidth);
@@ -303,9 +303,9 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector<Tensor *> &inputs, const s
              mOpenCLBackend->onAcquireBuffer(mSource.get(), Backend::DYNAMIC);
              mOpenCLBackend->onReleaseBuffer(mSource.get(), Backend::DYNAMIC);
              unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("input_transe_buf", "conv_transe_c4_c1", {});
-             
+
              uint32_t mMaxWGS_S = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel));
-             
+
              mTranseGlobalWorkSize = {static_cast<uint32_t>(inputWidth * inputHeight),
                                       static_cast<uint32_t>(UP_DIV(inputShape.at(3), 4)),
                                       static_cast<uint32_t>(inputShape.at(0))};
@@ -321,7 +321,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector<Tensor *> &inputs, const s
              unit.kernel->get().setArg(idx++, UP_DIV(inputShape.at(3), 4));
              unit.kernel->get().setArg(idx++, static_cast<uint32_t>(inputpad.left));
              unit.kernel->get().setArg(idx++, static_cast<uint32_t>(inputpad.right));
-             
+
              mTranseLocalWorkSize = localWS3DDefault(mTranseGlobalWorkSize, mMaxWGS_S, mOpenCLBackend->getOpenCLRuntime(), "conv_transe_c4_c1", unit.kernel).first;
              mOpenCLBackend->recordKernel3d(unit.kernel, mTranseGlobalWorkSize, mTranseLocalWorkSize);
          } else {
@@ -329,9 +329,9 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector<Tensor *> &inputs, const s
              mOpenCLBackend->onAcquireBuffer(mSource.get(), Backend::DYNAMIC);
              mOpenCLBackend->onReleaseBuffer(mSource.get(), Backend::DYNAMIC);
              unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("input_transe_buf", "conv_transe_c4_c16", {});
-             
+
              uint32_t mMaxWGS_S = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel));
-             
+
              mTranseGlobalWorkSize = {static_cast<uint32_t>(inputWidth * inputHeight),
                                       static_cast<uint32_t>(UP_DIV(inputShape.at(3), 4)),
                                       static_cast<uint32_t>(inputShape.at(0))};
@@ -347,7 +347,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector<Tensor *> &inputs, const s
              unit.kernel->get().setArg(idx++, UP_DIV(inputShape.at(3), 4));
              unit.kernel->get().setArg(idx++, static_cast<uint32_t>(inputpad.left));
              unit.kernel->get().setArg(idx++, static_cast<uint32_t>(inputpad.right));
-             
+
              mTranseLocalWorkSize = localWS3DDefault(mTranseGlobalWorkSize, mMaxWGS_S, mOpenCLBackend->getOpenCLRuntime(), "conv_transe_c4_c16", unit.kernel).first;
              mOpenCLBackend->recordKernel3d(unit.kernel, mTranseGlobalWorkSize, mTranseLocalWorkSize);
          }
@@ -355,7 +355,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector<Tensor *> &inputs, const s
         unit.localWorkSize = {mTranseLocalWorkSize[0], mTranseLocalWorkSize[1], mTranseLocalWorkSize[2]};
         mUnits.emplace_back(unit);
     }
-    
+
     Unit unit;
     if (inputChannels < 16 && in_c_pack == 4) {
          std::set<std::string> buildOptions = mResource->mBuildOptions;
@@ -410,7 +410,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector<Tensor *> &inputs, const s
 #ifdef LOG_VERBOSE
     MNN_PRINT("end ConvSubgroupBuf onResize !\n");
 #endif
-    
+
     mOpenCLBackend->recordKernel3d(unit.kernel , mGlobalWorkSize, mLocalWorkSize);
     unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
     unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
diff --git a/source/backend/opencl/execution/buffer/DeconvBufExecution.cpp b/source/backend/opencl/execution/buffer/DeconvBufExecution.cpp
index 4f1b990b7..096594ebc 100644
--- a/source/backend/opencl/execution/buffer/DeconvBufExecution.cpp
+++ b/source/backend/opencl/execution/buffer/DeconvBufExecution.cpp
@@ -35,7 +35,7 @@ DeconvBufExecution::DeconvBufExecution(const std::vector<Tensor *> &inputs, cons
     const float* filterDataPtr = nullptr;
     int weightSize = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2dParams, &filterDataPtr, &weightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &weightSize);
 
     int inputChannel  = weightSize / (kernelWidth * kernelHeight * outputChannel);
     std::vector<int> filterShape{outputChannel, inputChannel, kernelHeight, kernelWidth};
diff --git a/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp b/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp
index 4e541fc4c..5bc18f9ff 100644
--- a/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp
+++ b/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp
@@ -34,7 +34,7 @@ DepthwiseConvBufExecution::DepthwiseConvBufExecution(const std::vector<Tensor *>
     const float* filterDataPtr = nullptr;
     int filterDataSize   = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, backend, mResource->mConv2dParams, &filterDataPtr, &filterDataSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &filterDataSize);
 
     mResource->mFilter.reset(Tensor::createDevice<float>({1, ROUND_UP(filterImageShape[1], 2)/*for kernel C8 read*/, 1, 4 * filterImageShape[0]}));
     std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>(filterShape));
diff --git a/source/backend/opencl/execution/buffer/DepthwiseConvSubgroupBufExecution.cpp b/source/backend/opencl/execution/buffer/DepthwiseConvSubgroupBufExecution.cpp
index 6b62c1286..90bcb5c36 100644
--- a/source/backend/opencl/execution/buffer/DepthwiseConvSubgroupBufExecution.cpp
+++ b/source/backend/opencl/execution/buffer/DepthwiseConvSubgroupBufExecution.cpp
@@ -40,7 +40,7 @@ DepthwiseConvSubgroupBufExecution::DepthwiseConvSubgroupBufExecution(const std::
         const float *filterDataPtr = nullptr;
         int filterDataSize         = 0;
         std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-        ConvolutionCommon::getConvParameters(&quanCommon, backend, mResource->mConv2dParams, &filterDataPtr, &filterDataSize);
+        ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &filterDataSize);
         if (filterDataPtr != nullptr) {
             std::shared_ptr<Tensor> sourceWeight(Tensor::create<float>(
                 std::vector<int>{1, outputChannel, kernelWidth, kernelHeight},
@@ -112,7 +112,7 @@ DepthwiseConvSubgroupBufExecution::DepthwiseConvSubgroupBufExecution(const std::
         }
         mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(biasBuffer, biasPtrCL);
     }
-    
+
     if (mResource->mConv2dCommonParams->relu() == true) {
         mResource->mBuildOptions.emplace("-DRELU");
     } else if (mResource->mConv2dCommonParams->relu6() == true) {
@@ -178,7 +178,7 @@ ErrorCode DepthwiseConvSubgroupBufExecution::onEncode(const std::vector<Tensor *
     auto padding = ConvolutionCommon::convolutionPad(input, output, mResource->mConv2dCommonParams);
     mPaddings[0] = padding.second;//padY
     mPaddings[1] = padding.first;//padX
-    
+
     const int outputHeight = outputShape.at(1);
     const int outputWidth  = outputShape.at(2);
     const int outputChannel  = outputShape.at(3);
@@ -190,7 +190,7 @@ ErrorCode DepthwiseConvSubgroupBufExecution::onEncode(const std::vector<Tensor *
     const int inputChannelBlocks = UP_DIV(inputChannels, 4);
     const int filterHeight       = mResource->mConv2dParams->common()->kernelY();
     const int filterWidth        = mResource->mConv2dParams->common()->kernelX();
-    
+
     int inputImageShape[2]  = {inputHeight, inputWidth};
     int outputImageShape[2] = {outputHeight, outputWidth};
     int strideShape[2]      = {mResource->mStrides[0], mResource->mStrides[1]};
@@ -273,7 +273,7 @@ ErrorCode DepthwiseConvSubgroupBufExecution::onEncode(const std::vector<Tensor *
     unit.kernel->get().setArg(idx++, static_cast<uint32_t>(outputpad.right));
     unit.kernel->get().setArg(idx++, static_cast<uint32_t>(paddingShape[1]));
     unit.kernel->get().setArg(idx++, static_cast<uint32_t>(paddingShape[0]));
-    
+
     mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize);
     unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]};
     unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]};
diff --git a/source/backend/opencl/execution/buffer/GroupNormBufExecution.cpp b/source/backend/opencl/execution/buffer/GroupNormBufExecution.cpp
index 03589bc6b..92485742b 100644
--- a/source/backend/opencl/execution/buffer/GroupNormBufExecution.cpp
+++ b/source/backend/opencl/execution/buffer/GroupNormBufExecution.cpp
@@ -81,6 +81,9 @@ GroupNormBufExecution::GroupNormBufExecution(const MNN::Op* op, Backend* backend
         } else {
             MNN_ERROR("GroupNorm Beta map error:%d\n", res);
         }
+	
+	mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(gammaBuffer, GammaPtrCL);
+        mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(betaBuffer, BetaPtrCL);
     }
 }
 
diff --git a/source/backend/opencl/execution/buffer/MatmulBufExecution.cpp b/source/backend/opencl/execution/buffer/MatmulBufExecution.cpp
index 0d0e645db..ea055eb37 100644
--- a/source/backend/opencl/execution/buffer/MatmulBufExecution.cpp
+++ b/source/backend/opencl/execution/buffer/MatmulBufExecution.cpp
@@ -58,7 +58,7 @@ ErrorCode MatMulBufExecution::onEncode(const std::vector<Tensor *> &inputs, cons
     bool canUseTile = (M % tileM == 0) && \
         (N % tileN == 0) && \
         (K % tileK == 0);
-    bool canUseLargeTile = canUseTile && mTransposeA && !mTransposeB && inputs.size() == 2;
+    bool canUseLargeTile = canUseTile && mTransposeA && !mTransposeB;
     if (!canUseLargeTile) {
         // set small tile
         tileM = 64;
@@ -72,16 +72,41 @@ ErrorCode MatMulBufExecution::onEncode(const std::vector<Tensor *> &inputs, cons
     
     if(canUseLargeTile) {
         // Match with Large tileM->MWG tileN->NWG tileK->KWG localM->MDIMA localN->NDIMC
-        buildOptions.emplace(" -DGEMMK=0 -DKREG=1 -DKWG=32 -DKWI=2 -DMDIMA=32 -DMDIMC=32 -DMWG=128 -DNDIMB=8 -DNDIMC=8 -DNWG=128 -DSA=0 -DSB=0 -DSTRM=0 -DSTRN=1 -DVWM=2 -DVWN=8 -DOUTPUTMN");
+        uint32_t layout = 4;
+        uint32_t batch = 1;
+        std::vector<uint32_t> param;
+        if(inputs.size() == 2) {
+            param = getGemmParams({(uint32_t)M, (uint32_t)N, (uint32_t)K, layout, batch, (uint32_t)0}, {openCLBuffer(input0), openCLBuffer(input1), openCLBuffer(output)}, mOpenCLBackend->getOpenCLRuntime());
+        } else {
+            param = getGemmParams({(uint32_t)M, (uint32_t)N, (uint32_t)K, layout, batch, (uint32_t)1}, {openCLBuffer(input0), openCLBuffer(input1), openCLBuffer(output), openCLBuffer(inputs[2])}, mOpenCLBackend->getOpenCLRuntime());
+        }
+        int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13];
+        buildOptions.emplace("-DKWG=" + std::to_string(KWG));
+        buildOptions.emplace("-DKWI=" + std::to_string(KWI));
+        buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA));
+        buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC));
+        buildOptions.emplace("-DMWG=" + std::to_string(MWG));
+        buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB));
+        buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC));
+        buildOptions.emplace("-DNWG=" + std::to_string(NWG));
+        buildOptions.emplace("-DSA=" + std::to_string(SA));
+        buildOptions.emplace("-DSB=" + std::to_string(SB));
+        buildOptions.emplace("-DSTRM=" + std::to_string(STRM));
+        buildOptions.emplace("-DSTRN=" + std::to_string(STRN));
+        buildOptions.emplace("-DVWM=" + std::to_string(VWM));
+        buildOptions.emplace("-DVWN=" + std::to_string(VWN));
+        if(layout >= 4) {
+            buildOptions.emplace("-DOUTPUTMN");
+        }
+
+        if(inputs.size() > 2) {
+            buildOptions.emplace(" -DBIAS_TYPE=1");
+        }
         if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) {
             buildOptions.emplace("-DUSE_CL_MAD=1");
             buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1");
         }
-        if(runtime->isSupportedFP16()){
-            buildOptions.emplace(" -DPRECISION=16");
-        } else {
-            buildOptions.emplace(" -DPRECISION=32");
-        }
+
         unit.kernel       = runtime->buildKernel("matmul_params_buf", "Xgemm", buildOptions);
 
      } else if(canUseTile) {
@@ -117,7 +142,9 @@ ErrorCode MatMulBufExecution::onEncode(const std::vector<Tensor *> &inputs, cons
         
         float alpha = 1.0;
         float beta = 0.0f;
-        int offset = 0;
+        int offset[4] = {0, 0, 0, 0};
+        int stride[4] = {M, N, N, N};
+        
         int idx            = 0;
         ret |= unit.kernel->get().setArg(idx++, static_cast<int>(M));
         ret |= unit.kernel->get().setArg(idx++, static_cast<int>(N));
@@ -131,8 +158,7 @@ ErrorCode MatMulBufExecution::onEncode(const std::vector<Tensor *> &inputs, cons
         }
         ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output));
         ret |= unit.kernel->get().setArg(idx++, offset);
-        ret |= unit.kernel->get().setArg(idx++, offset);
-        ret |= unit.kernel->get().setArg(idx++, offset);
+        ret |= unit.kernel->get().setArg(idx++, stride);
         
         MNN_CHECK_CL_SUCCESS(ret, "setArg MatMulBufExecution use large tile opt");
         
diff --git a/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.cpp b/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.cpp
new file mode 100644
index 000000000..ff1bddda1
--- /dev/null
+++ b/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.cpp
@@ -0,0 +1,470 @@
+//
+//  StrassenMatmulComputor.cpp
+//  MNN
+//
+//  Created by MNN on 2024/08/01.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+#ifndef MNN_OPENCL_BUFFER_CLOSED
+#include "backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.hpp"
+#include "core/TensorUtils.hpp"
+//#define MNN_OPEN_TIME_TRACE
+#include <MNN/AutoTime.hpp>
+
+namespace MNN {
+namespace OpenCL {
+    
+class AutoMemory {
+public:
+    AutoMemory(int size, OpenCLBackend* backend) {
+        mOpenCLBackend = backend;
+        mTempTensor.reset(Tensor::createDevice<float>({size}));
+        bool res = mOpenCLBackend->onAcquireBuffer(mTempTensor.get(), Backend::DYNAMIC);
+        if (!res) {
+            MNN_ERROR("Strassen out of memory\n");
+        }
+        mAddrPtr = openCLBuffer(mTempTensor.get());
+    }
+    ~ AutoMemory() {
+        mOpenCLBackend->onReleaseBuffer(mTempTensor.get(), Backend::DYNAMIC);
+    }
+    const cl::Buffer& get() const {
+        return mAddrPtr;
+    }
+private:
+    cl::Buffer mAddrPtr;
+    OpenCLBackend* mOpenCLBackend;
+    std::shared_ptr<Tensor> mTempTensor;
+};
+
+StrassenMatrixComputor::StrassenMatrixComputor(Backend* bn, int maxDepth) {
+    mMaxDepth = maxDepth;
+    mOpenCLBackend = static_cast<OpenCLBackend*>(bn);
+    mBytes = (mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16() ? 2 : 4);
+    onReset();
+};
+StrassenMatrixComputor::~StrassenMatrixComputor() {
+    // Do nothing
+}
+    
+ErrorCode StrassenMatrixComputor::_generateCFunction(cl::Buffer ptrC, int offsetC, int elementStrideC, cl::Buffer ptrA, int width, int height, Unit& unit) {
+    std::set<std::string> buildOptions;
+    int vec_h = 1;
+    buildOptions.emplace("-DVEC_H=" + std::to_string(vec_h));
+    unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("strassen_binary_buf", "binary_cfunction_buf", buildOptions);
+    auto maxWorkGroupSize      = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel));
+
+    std::vector<uint32_t> globalWorkSize =  {(uint32_t)UP_DIV(width, 8), (uint32_t)UP_DIV(height, vec_h)};
+    
+    uint32_t index = 0;
+    cl_int ret = CL_SUCCESS;
+    ret |= unit.kernel->get().setArg(index++, globalWorkSize[0]);
+    ret |= unit.kernel->get().setArg(index++, globalWorkSize[1]);
+    ret |= unit.kernel->get().setArg(index++, ptrC);
+    ret |= unit.kernel->get().setArg(index++, offsetC);
+    ret |= unit.kernel->get().setArg(index++, elementStrideC);
+    ret |= unit.kernel->get().setArg(index++, ptrA);
+    ret |= unit.kernel->get().setArg(index++, ptrC);
+    ret |= unit.kernel->get().setArg(index++, width);
+    ret |= unit.kernel->get().setArg(index++, height);
+
+    MNN_CHECK_CL_SUCCESS(ret, "Strassen setArg BinaryCFunctionExecution");
+
+    std::string name = "binary_cfunction_buf";
+    auto localWorkSize = localWS2DDefault(globalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), name, unit.kernel).first;
+    
+    globalWorkSize[0] = ROUND_UP(globalWorkSize[0], std::max((uint32_t)1, localWorkSize[0]));
+    globalWorkSize[1] = ROUND_UP(globalWorkSize[1], std::max((uint32_t)1, localWorkSize[1]));
+    
+    unit.globalWorkSize = {globalWorkSize[0], globalWorkSize[1]};
+    unit.localWorkSize  = {localWorkSize[0], localWorkSize[1]};
+    mOpenCLBackend->recordKernel2d(unit.kernel, globalWorkSize, localWorkSize);
+    return NO_ERROR;
+    
+}
+
+ErrorCode StrassenMatrixComputor::_generateBinary(cl::Buffer ptrC, cl::Buffer ptrA, cl::Buffer ptrB, int offsetC, int offsetA, int offsetB, int elementStrideC, int elementStrideA, int elementStrideB, int width, int height, bool isAdd, Unit& unit) {
+    std::set<std::string> buildOptions;
+    if(isAdd) {
+        buildOptions.emplace("-DOPERATOR=in0+in1");
+    } else {
+        buildOptions.emplace("-DOPERATOR=in0-in1");
+    }
+    int vec_h = 1;
+    buildOptions.emplace("-DVEC_H=" + std::to_string(vec_h));
+    unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("strassen_binary_buf", "binary_function_buf", buildOptions);
+    auto maxWorkGroupSize      = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel));
+
+    std::vector<uint32_t> globalWorkSize =  {(uint32_t)UP_DIV(width, 8), (uint32_t)UP_DIV(height, vec_h)};
+    int baseOffset[4] = {offsetA, offsetB, offsetC, 0};
+    int elementStride[4] = {elementStrideA, elementStrideB, elementStrideC, 0};
+
+    uint32_t index = 0;
+    cl_int ret = CL_SUCCESS;
+    ret |= unit.kernel->get().setArg(index++, globalWorkSize[0]);
+    ret |= unit.kernel->get().setArg(index++, globalWorkSize[1]);
+    ret |= unit.kernel->get().setArg(index++, ptrA);
+    ret |= unit.kernel->get().setArg(index++, ptrB);
+    ret |= unit.kernel->get().setArg(index++, ptrC);
+    ret |= unit.kernel->get().setArg(index++, baseOffset);
+    ret |= unit.kernel->get().setArg(index++, elementStride);
+
+    MNN_CHECK_CL_SUCCESS(ret, "Strassen setArg BinaryExecution");
+
+    std::string name = "binary_function_buf";
+    auto localWorkSize = localWS2DDefault(globalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), name, unit.kernel).first;
+    
+    globalWorkSize[0] = ROUND_UP(globalWorkSize[0], std::max((uint32_t)1, localWorkSize[0]));
+    globalWorkSize[1] = ROUND_UP(globalWorkSize[1], std::max((uint32_t)1, localWorkSize[1]));
+    
+    unit.globalWorkSize = {globalWorkSize[0], globalWorkSize[1]};
+    unit.localWorkSize  = {localWorkSize[0], localWorkSize[1]};
+    mOpenCLBackend->recordKernel2d(unit.kernel, globalWorkSize, localWorkSize);
+    return NO_ERROR;
+}
+
+ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, const MatrixInfo& AT, const MatrixInfo& BT, const MatrixInfo& CT, const MatrixInfo& COT, int postType, Unit& unit) {
+        
+    std::set<std::string> buildOptions;
+    
+    uint32_t layout = 4;
+    uint32_t batch = 1;
+    
+    std::vector<uint32_t> param;
+    if(COT.stackIndex < 0 || postType == 0) {
+        param = getGemmParams({(uint32_t)e, (uint32_t)h, (uint32_t)l, layout, batch, (uint32_t)0}, {mStack[AT.stackIndex], mStack[BT.stackIndex], mStack[CT.stackIndex]}, mOpenCLBackend->getOpenCLRuntime());
+    } else {
+        param = getGemmParams({(uint32_t)e, (uint32_t)h, (uint32_t)l, layout, batch, (uint32_t)postType}, {mStack[AT.stackIndex], mStack[BT.stackIndex], mStack[CT.stackIndex], mStack[COT.stackIndex]}, mOpenCLBackend->getOpenCLRuntime());
+    }
+    int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13];
+    buildOptions.emplace("-DKWG=" + std::to_string(KWG));
+    buildOptions.emplace("-DKWI=" + std::to_string(KWI));
+    buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA));
+    buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC));
+    buildOptions.emplace("-DMWG=" + std::to_string(MWG));
+    buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB));
+    buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC));
+    buildOptions.emplace("-DNWG=" + std::to_string(NWG));
+    buildOptions.emplace("-DSA=" + std::to_string(SA));
+    buildOptions.emplace("-DSB=" + std::to_string(SB));
+    buildOptions.emplace("-DSTRM=" + std::to_string(STRM));
+    buildOptions.emplace("-DSTRN=" + std::to_string(STRN));
+    buildOptions.emplace("-DVWM=" + std::to_string(VWM));
+    buildOptions.emplace("-DVWN=" + std::to_string(VWN));
+    if(layout >= 4) {
+        buildOptions.emplace("-DOUTPUTMN");
+    }
+
+    if(postType > 0) {
+        buildOptions.emplace(" -DBIAS_TYPE=" + std::to_string(postType));
+    }
+
+    int tileM = MWG;
+    int tileN = NWG;
+    int localM = MDIMC;
+    int localN = NDIMC;
+    int alignM = e;
+    int alignN = h;
+    int alignK = l;
+    if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) {
+        buildOptions.emplace("-DUSE_CL_MAD=1");
+        buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1");
+    }
+
+    unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_params_buf", "Xgemm", buildOptions);
+    
+    int out_per_thread_m = tileM / localM;
+    int out_per_thread_n = tileN / localN;
+    
+    std::vector<uint32_t> globalWorkSize = {static_cast<uint32_t>(alignM/out_per_thread_m), static_cast<uint32_t>(alignN/out_per_thread_n)};
+    std::vector<uint32_t> localWorkSize = {static_cast<uint32_t>(localM), static_cast<uint32_t>(localN)};
+    
+    float alpha = 1.0;
+    float beta = 0.0f;
+    // offset_a, offset_b, offset_c, offset_bias
+    int offset[4] = {AT.offsetBytes / mBytes, BT.offsetBytes / mBytes, CT.offsetBytes / mBytes, COT.offsetBytes / mBytes};
+    // stride_a, stride_b, stride_c, stride_bias
+    int stride[4] = {AT.lineStrideBytes / mBytes, BT.lineStrideBytes / mBytes, CT.lineStrideBytes / mBytes, COT.lineStrideBytes / mBytes};
+
+    int idx            = 0;
+    cl_int ret = CL_SUCCESS;
+    ret |= unit.kernel->get().setArg(idx++, static_cast<int>(alignM));
+    ret |= unit.kernel->get().setArg(idx++, static_cast<int>(alignN));
+    ret |= unit.kernel->get().setArg(idx++, static_cast<int>(alignK));
+    ret |= unit.kernel->get().setArg(idx++, alpha);
+    ret |= unit.kernel->get().setArg(idx++, beta);
+    ret |= unit.kernel->get().setArg(idx++, mStack[AT.stackIndex]);
+    ret |= unit.kernel->get().setArg(idx++, mStack[BT.stackIndex]);
+    if(postType > 0) {
+        ret |= unit.kernel->get().setArg(idx++, mStack[COT.stackIndex]);
+    }
+    ret |= unit.kernel->get().setArg(idx++, mStack[CT.stackIndex]);
+    ret |= unit.kernel->get().setArg(idx++, offset);
+    ret |= unit.kernel->get().setArg(idx++, stride);
+    
+    MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf Strassen Kernel Select");
+    
+    unit.globalWorkSize = {globalWorkSize[0], globalWorkSize[1]};
+    unit.localWorkSize  = {localWorkSize[0], localWorkSize[1]};
+    mOpenCLBackend->recordKernel2d(unit.kernel, globalWorkSize, localWorkSize);
+
+    return NO_ERROR;
+}
+
+ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const MatrixInfo& AT, const MatrixInfo& BT, const MatrixInfo& CT, const MatrixInfo& COT, int currentDepth, int postType) {
+
+    bool isAligned = (e % 32 == 0 && l % 4 == 0 && h % 32 == 0);
+    bool enoughComputation = (e >= 512 && l >= 512 && h >= 512) && (1.0 * e / 1024 * l / 1024 * h / 1024 >= 4.0);
+    
+    if (currentDepth >= mMaxDepth || !isAligned || !enoughComputation) {// not align or not enough computation
+        Unit unit;
+        auto res = _generateBasicMatMul(e, l, h, AT, BT, CT, COT, postType, unit);
+        mUnits.emplace_back(unit);
+        return res;
+    }
+    int eSub = e / 2;
+    int hSub = h / 2;
+    int lSub = l / 2;
+    
+    // Compute expand the memory read and write cost
+    float AComputeCost = 1.0 * eSub * lSub * 12 * mBytes;// 4 times, 3 matrix each time
+    float BComputeCost = 1.0 * lSub * hSub * 12 * mBytes;// 4 times, 3 matrix each time
+    float CComputeCost = 1.0 * eSub * hSub * (8 + 3 * 2) * mBytes;// 3 times, 8 matrix first time, 3 matrix last two times
+    // Compute save compute time
+    float saveMatMulCost =  1.0 * eSub * lSub * hSub * 2;// 2 for Mul_ADD
+    
+    // devices peak compute value / memory bandwidth
+    const float penalty = 30.0;//FIXME: Find beter way to set it
+    float saveCost = saveMatMulCost - (AComputeCost + BComputeCost + CComputeCost) * penalty;
+    
+    if (saveCost <= 0.0f) {
+        Unit unit;
+        auto res = _generateBasicMatMul(e, l, h, AT, BT, CT, COT, postType, unit);
+        mUnits.emplace_back(unit);
+        return res;
+    }
+    
+    // Strassen Construct
+    currentDepth += 1;
+    
+    auto maxlH = std::max(lSub, hSub);
+    
+    AutoMemory YAddr(hSub * lSub, mOpenCLBackend);
+    AutoMemory XAddr(maxlH * eSub, mOpenCLBackend);
+
+    MatrixInfo Y;
+    Y.stackIndex = (int)mStack.size();
+    mStack.emplace_back(YAddr.get());
+    Y.offsetBytes = 0;
+    Y.lineStrideBytes = hSub * mBytes;
+    MatrixInfo X;
+    X.stackIndex = (int)mStack.size();
+    X.offsetBytes = 0;
+    X.lineStrideBytes = eSub * mBytes;
+    mStack.emplace_back(XAddr.get());
+    
+    MatrixInfo CX;
+    CX.stackIndex = X.stackIndex;
+    CX.offsetBytes = 0;
+    CX.lineStrideBytes = hSub * mBytes;
+    
+    MatrixInfo a11 = AT;
+    MatrixInfo a12 = AT;
+    a12.offsetBytes = AT.offsetBytes + AT.lineStrideBytes * lSub;
+    MatrixInfo a21 = AT;
+    a21.offsetBytes = AT.offsetBytes + eSub * mBytes;
+    MatrixInfo a22 = AT;
+    a22.offsetBytes = AT.offsetBytes + eSub * mBytes + AT.lineStrideBytes * lSub;
+    
+    MatrixInfo b11 = BT;
+    MatrixInfo b12 = BT;
+    b12.offsetBytes = BT.offsetBytes + hSub * mBytes;
+    MatrixInfo b21 = BT;
+    b21.offsetBytes = BT.offsetBytes + BT.lineStrideBytes * lSub;
+    MatrixInfo b22 = BT;
+    b22.offsetBytes = BT.offsetBytes + BT.lineStrideBytes * lSub + hSub * mBytes;
+    
+    MatrixInfo c11 = CT;
+    MatrixInfo c12 = CT;
+    c12.offsetBytes = CT.offsetBytes + hSub * mBytes;
+    MatrixInfo c21 = CT;
+    c21.offsetBytes = CT.offsetBytes + CT.lineStrideBytes * eSub;
+    MatrixInfo c22 = CT;
+    c22.offsetBytes = CT.offsetBytes + CT.lineStrideBytes * eSub + hSub * mBytes;
+    
+    MatrixInfo Empty;
+    Empty.stackIndex = -1;
+    
+    {
+        // S3=A11-A21, T3=B22-B12, P7=S3*T3
+        {
+            Unit unit;
+            _generateBinary(mStack[X.stackIndex], mStack[a11.stackIndex], mStack[a21.stackIndex], X.offsetBytes/mBytes, a11.offsetBytes/mBytes, a21.offsetBytes/mBytes, X.lineStrideBytes/mBytes, a11.lineStrideBytes/mBytes, a21.lineStrideBytes/mBytes, eSub, lSub, false, unit);
+            mUnits.emplace_back(unit);
+        }
+        {
+            Unit unit;
+            _generateBinary(mStack[Y.stackIndex], mStack[b22.stackIndex], mStack[b12.stackIndex], Y.offsetBytes/mBytes, b22.offsetBytes/mBytes, b12.offsetBytes/mBytes, Y.lineStrideBytes/mBytes, b22.lineStrideBytes/mBytes, b12.lineStrideBytes/mBytes, hSub, lSub, false, unit);
+            mUnits.emplace_back(unit);
+        }
+
+        auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c21, Empty, currentDepth, 0);
+        if (code != NO_ERROR) {
+            return code;
+        }
+    }
+    {
+        // S1=A21+A22, T1=B12-B11, P5=S1T1
+        {
+            Unit unit;
+            _generateBinary(mStack[X.stackIndex], mStack[a21.stackIndex], mStack[a22.stackIndex], X.offsetBytes/mBytes, a21.offsetBytes/mBytes, a22.offsetBytes/mBytes, X.lineStrideBytes/mBytes, a21.lineStrideBytes/mBytes, a22.lineStrideBytes/mBytes, eSub, lSub, true, unit);
+            mUnits.emplace_back(unit);
+        }
+        {
+            Unit unit;
+            _generateBinary(mStack[Y.stackIndex], mStack[b12.stackIndex], mStack[b11.stackIndex], Y.offsetBytes/mBytes, b12.offsetBytes/mBytes, b11.offsetBytes/mBytes, Y.lineStrideBytes/mBytes, b12.lineStrideBytes/mBytes, b11.lineStrideBytes/mBytes, hSub, lSub, false, unit);
+            mUnits.emplace_back(unit);
+        }
+  
+        auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c22, Empty, currentDepth, 0);
+        if (code != NO_ERROR) {
+            return code;
+        }
+    }
+    {
+        // S2=S1-A11, T2=B22-T1, P6=S2T2
+        {
+            Unit unit;
+            _generateBinary(mStack[X.stackIndex], mStack[X.stackIndex], mStack[a11.stackIndex], X.offsetBytes/mBytes, X.offsetBytes/mBytes, a11.offsetBytes/mBytes, X.lineStrideBytes/mBytes, X.lineStrideBytes/mBytes, a11.lineStrideBytes/mBytes, eSub, lSub, false, unit);
+            mUnits.emplace_back(unit);
+        }
+        {
+            Unit unit;
+            _generateBinary(mStack[Y.stackIndex], mStack[b22.stackIndex], mStack[Y.stackIndex], Y.offsetBytes/mBytes, b22.offsetBytes/mBytes, Y.offsetBytes/mBytes, Y.lineStrideBytes/mBytes, b22.lineStrideBytes/mBytes, Y.lineStrideBytes/mBytes, hSub, lSub, false, unit);
+            mUnits.emplace_back(unit);
+        }
+  
+        auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c12, Empty, currentDepth, 0);
+        if (code != NO_ERROR) {
+            return code;
+        }
+    }
+    {
+        // S4=A12-S2, P3=S4*B22, P1=A11*B11
+        {
+            Unit unit;
+            _generateBinary(mStack[X.stackIndex], mStack[a12.stackIndex], mStack[X.stackIndex], X.offsetBytes/mBytes, a12.offsetBytes/mBytes, X.offsetBytes/mBytes, X.lineStrideBytes/mBytes, a12.lineStrideBytes/mBytes, X.lineStrideBytes/mBytes, eSub, lSub, false, unit);
+            mUnits.emplace_back(unit);
+        }
+
+        auto code = _generateMatMul(eSub, lSub, hSub, X, b22, c11, Empty, currentDepth, 0);
+        if (code != NO_ERROR) {
+            return code;
+        }
+        code = _generateMatMul(eSub, lSub, hSub, a11, b11, CX, Empty, currentDepth, 0);
+        if (code != NO_ERROR) {
+            return code;
+        }
+    }
+    {
+        // U2=P1+P6, U3=U2+P7, U4=U2+P5, U7=U3+P5
+        // U5=U4+P3, T4=T2-B21, P4=A22*T4
+        {
+            Unit unit;
+            _generateCFunction(mStack[CT.stackIndex], CT.offsetBytes/mBytes, CT.lineStrideBytes/mBytes, mStack[CX.stackIndex], hSub, eSub, unit);
+            mUnits.emplace_back(unit);
+        }
+
+        {
+            Unit unit;
+            _generateBinary(mStack[Y.stackIndex], mStack[Y.stackIndex], mStack[b21.stackIndex], Y.offsetBytes/mBytes, Y.offsetBytes/mBytes, b21.offsetBytes/mBytes, Y.lineStrideBytes/mBytes, Y.lineStrideBytes/mBytes, b21.lineStrideBytes/mBytes, hSub, lSub, false, unit);
+            mUnits.emplace_back(unit);
+        }
+    }
+    {
+        auto code = _generateMatMul(eSub, lSub, hSub, a22, Y, c11, Empty, currentDepth, 0);
+        if (code != NO_ERROR) {
+            return code;
+        }
+        // U6=U3-P4, P2=A12*B21, U1=P1+P2
+        {
+            Unit unit;
+            _generateBinary(mStack[c21.stackIndex], mStack[c21.stackIndex], mStack[c11.stackIndex], c21.offsetBytes/mBytes, c21.offsetBytes/mBytes, c11.offsetBytes/mBytes, c21.lineStrideBytes/mBytes, c21.lineStrideBytes/mBytes, c11.lineStrideBytes/mBytes, hSub, eSub, false, unit);
+            mUnits.emplace_back(unit);
+        }
+        
+        {
+            auto code = _generateMatMul(eSub, lSub, hSub, a12, b21, c11, Empty, currentDepth, 0);
+            if (code != NO_ERROR) {
+                return code;
+            }
+            Unit unit;
+            _generateBinary(mStack[c11.stackIndex], mStack[c11.stackIndex], mStack[CX.stackIndex], c11.offsetBytes/mBytes, c11.offsetBytes/mBytes, CX.offsetBytes/mBytes, c11.lineStrideBytes/mBytes, c11.lineStrideBytes/mBytes, CX.lineStrideBytes/mBytes, hSub, eSub, true, unit);
+            mUnits.emplace_back(unit);
+        }
+
+    }
+    return NO_ERROR;
+}
+
+void StrassenMatrixComputor::onReset() {
+    mStack.clear();
+    mUnits.clear();
+}
+
+ErrorCode StrassenMatrixComputor::onEncode(int e, int l, int h, int as, int bs, int cs, const cl::Buffer AT, const cl::Buffer BT, cl::Buffer CT, bool useBias, const cl::Buffer Bias) {
+    mM = e;
+    mN = h;
+    mK = l;
+    MatrixInfo a,b,c,bias;
+    bias.stackIndex = -1;
+    mUnits.clear();
+    mStack = {AT, BT, CT};
+    if (useBias) {
+        bias.stackIndex = 3;
+        bias.offsetBytes = 0;
+        mStack.emplace_back(Bias);
+    }
+    a.stackIndex = 0;
+    a.lineStrideBytes = as * mBytes;
+    a.offsetBytes = 0;
+    
+    b.stackIndex = 1;
+    b.lineStrideBytes = bs * mBytes;
+    b.offsetBytes = 0;
+    
+    c.stackIndex = 2;
+    c.lineStrideBytes = cs * mBytes;
+    c.offsetBytes = 0;
+    return _generateMatMul(e, l, h, a, b, c, bias, 0, useBias);
+}
+
+void StrassenMatrixComputor::onExecute() {
+    // All is done in onResize, just execute it
+    auto res = CL_SUCCESS;
+    int count = 0;
+    for (auto &unit : mUnits) {
+        if(unit.localWorkSize[0] == 0 || unit.localWorkSize[1] == 0) {
+            unit.localWorkSize = cl::NullRange;
+        }
+#ifdef ENABLE_OPENCL_TIME_PROFILER
+        cl::Event event;
+        res = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueNDRangeKernel(unit.kernel->get(),
+                                                cl::NullRange,
+                                                unit.globalWorkSize,
+                                                unit.localWorkSize,
+                                                nullptr,
+                                                &event);
+        mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Strassen-" + std::to_string(count++) + "-m" + std::to_string(mM) + "-n" + std::to_string(mN) + "-k" + std::to_string(mK), event});
+#else
+        res = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueNDRangeKernel(unit.kernel->get(),
+                                                                                      cl::NullRange,
+                                                                                      unit.globalWorkSize,
+                                                                                      unit.localWorkSize);
+#endif
+        MNN_CHECK_CL_SUCCESS(res, "Strassen execute");
+    }
+}
+} // namespace MNN
+}
+#endif
diff --git a/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.hpp b/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.hpp
new file mode 100644
index 000000000..dc6a9fa7a
--- /dev/null
+++ b/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.hpp
@@ -0,0 +1,67 @@
+//
+//  StrassenMatmulComputor.hpp
+//  MNN
+//
+//  Created by MNN on 2024/08/01.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+#ifndef MNN_OPENCL_BUFFER_CLOSED
+
+#ifndef StrassenMatmulOpenCLComputor_hpp
+#define StrassenMatmulOpenCLComputor_hpp
+
+#include "core/BufferAllocator.hpp"
+#include "core/Backend.hpp"
+#include "backend/opencl/execution/image/CommonExecution.hpp"
+
+namespace MNN {
+namespace OpenCL {
+/**
+ Based on
+ Boyer, B., Dumas, J.-G., Pernet, C., & Zhou, W. (2007). Memory efficient scheduling of Strassen-Winogradʼs matrix multiplication algorithm. Proceedings of the 2009 international symposium on Symbolic and algebraic computation ISSAC 09, 55. ACM Press. Retrieved from http://arxiv.org/abs/0707.2347
+ 
+ Use Table 2
+ */
+class StrassenMatrixComputor {
+public:
+    StrassenMatrixComputor(Backend* bn, int maxDepth);
+    virtual ~StrassenMatrixComputor();
+    
+    ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const cl::Buffer AT, const cl::Buffer BT, cl::Buffer CT, bool useBias, const cl::Buffer Bias);
+    
+    void onExecute();
+    
+    void onReset();
+private:
+    struct MatrixInfo {
+        int stackIndex;
+        int offsetBytes;
+        int lineStrideBytes;
+    };
+    
+    /* postType:
+     0 --> without post process
+     1 --> with bias (one dimension)
+     2 --> with feature map D to eltwise add ( Y = X + D)
+     3 --> with feature map D to eltwise sub ( Y = X - D)
+     4 --> with feature map D to eltwise sub and get negative( Y = D - X)
+     */
+    ErrorCode _generateMatMul(int e, int l, int h, const MatrixInfo& AT, const MatrixInfo& BT, const MatrixInfo& CT, const MatrixInfo& COT, int currentDepth, int postType = 0);
+    ErrorCode _generateBasicMatMul(int e, int l, int h, const MatrixInfo& AT, const MatrixInfo& BT, const MatrixInfo& CT, const MatrixInfo& COT, int postType, Unit& unit);
+    
+    ErrorCode _generateBinary(cl::Buffer ptrC, cl::Buffer ptrA, cl::Buffer ptrB, int offsetC, int offsetA, int offsetB, int elementStrideC, int elementStrideA, int elementStrideB, int width, int height, bool isAdd, Unit& unit);
+
+    ErrorCode _generateCFunction(cl::Buffer ptrC, int offsetC, int elementStrideC, cl::Buffer ptrA, int width, int height, Unit& unit);
+    
+private:
+    std::vector<Unit> mUnits;
+    int mMaxDepth;
+    OpenCLBackend* mOpenCLBackend;
+    int mM, mN, mK;
+    std::vector<cl::Buffer> mStack;
+    int mBytes = 4;
+};
+} // namespace MNN
+}
+#endif /* StrassenMatmulOpenCLComputor_hpp */
+#endif
diff --git a/source/backend/opencl/execution/cl/buffer_convert_quant.cl b/source/backend/opencl/execution/cl/buffer_convert_quant.cl
index 1215cc71b..5043e1418 100644
--- a/source/backend/opencl/execution/cl/buffer_convert_quant.cl
+++ b/source/backend/opencl/execution/cl/buffer_convert_quant.cl
@@ -20,18 +20,18 @@ __kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int8(GLOBAL_SIZE_2_DIMS
                                             __global char *output) {
     int image_width_idx  = get_global_id(0); // ic
     int image_height_idx = get_global_id(1); // oc/4 h w
- 
+
     DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
- 
+
     const int input_channel_4_idx  = image_width_idx;
     const int output_channel_4_idx = (image_height_idx / height_width_size) * 4;
     const int height_width_idx     = image_height_idx % height_width_size;
     const int buffer_height_idx    = height_width_idx / kernel_shape.y;
     const int buffer_width_idx     = height_width_idx % kernel_shape.y;
- 
+
     const int buffer_offset = output_channel_4_idx * ic_h_w_size + input_channel_4_idx * height_width_size +
                               buffer_height_idx * kernel_shape.y + buffer_width_idx;
- 
+
     char4 output_values = 0;
     if (output_channel_4_idx < output_channel) {
         const int remain_channel = output_channel - output_channel_4_idx;
@@ -51,7 +51,7 @@ __kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int8(GLOBAL_SIZE_2_DIMS
             output_values.y = (char)(*(input_ptr + offset));
             offset += ic_h_w_size;
             output_values.z = (char)(*(input_ptr + offset));
- 
+
         } else if (remain_channel == 2) {
             int offset      = buffer_offset;
             output_values.x = (char)(*(input_ptr + offset));
@@ -70,7 +70,7 @@ __kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int8(GLOBAL_SIZE_2_DIMS
 #ifdef USE_LOW_BIT_WEIGHT_INT4
 // convert kernel : from int8 buffer(oihw) to int4 image(oc/4 h w , ic oc4)
 __kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int4(GLOBAL_SIZE_2_DIMS
-                                            __global const char *input_ptr,
+                                            __global const uchar *input_ptr,
                                             __private const int output_channel,
                                             __private const int2 kernel_shape,
                                             __private const int ic_h_w_size,
@@ -78,53 +78,26 @@ __kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int4(GLOBAL_SIZE_2_DIMS
                                             __global uchar *output) {
     int image_width_idx  = get_global_id(0); // ic
     int image_height_idx = get_global_id(1); // oc/4 h w
- 
+
     DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx);
- 
+
     const int input_channel_4_idx  = image_width_idx;
     const int output_channel_4_idx = (image_height_idx / height_width_size) * 4;
     const int height_width_idx     = image_height_idx % height_width_size;
     const int buffer_height_idx    = height_width_idx / kernel_shape.y;
     const int buffer_width_idx     = height_width_idx % kernel_shape.y;
- 
-    const int buffer_offset = output_channel_4_idx * ic_h_w_size + input_channel_4_idx * height_width_size +
-                              buffer_height_idx * kernel_shape.y + buffer_width_idx;
- 
-    char4 output_values_int8 = 0;
-    if (output_channel_4_idx < output_channel) {
-        const int remain_channel = output_channel - output_channel_4_idx;
-        if (remain_channel >= 4) {
-            int offset      = buffer_offset;
-            output_values_int8.x = (char)(*(input_ptr + offset));
-            offset          = mad24(1, ic_h_w_size, offset);
-            output_values_int8.y = (char)(*(input_ptr + offset));
-            offset += ic_h_w_size;
-            output_values_int8.z = (char)(*(input_ptr + offset));
-            offset += ic_h_w_size;
-            output_values_int8.w = (char)(*(input_ptr + offset));
-        } else if (remain_channel == 3) {
-            int offset      = buffer_offset;
-            output_values_int8.x = (char)(*(input_ptr + offset));
-            offset          = mad24(1, ic_h_w_size, offset);
-            output_values_int8.y = (char)(*(input_ptr + offset));
-            offset += ic_h_w_size;
-            output_values_int8.z = (char)(*(input_ptr + offset));
- 
-        } else if (remain_channel == 2) {
-            int offset      = buffer_offset;
-            output_values_int8.x = (char)(*(input_ptr + offset));
-            offset          = mad24(1, ic_h_w_size, offset);
-            output_values_int8.y = (char)(*(input_ptr + offset));
-        } else if (remain_channel == 1) {
-            int offset      = buffer_offset;
-            output_values_int8.x = (char)(*(input_ptr + offset));
-        }
-    }
- 
+
+    const int buffer_offset = output_channel_4_idx * ic_h_w_size + input_channel_4_idx * height_width_size + buffer_height_idx * kernel_shape.y + buffer_width_idx;
+    int index0 = buffer_offset, index1 = buffer_offset + ic_h_w_size, index2 = buffer_offset + 2 * ic_h_w_size, index3 = buffer_offset + 3 * ic_h_w_size;
+
     uchar2 output_values_int4 = (uchar2)(0, 0);
-    output_values_int4.s0 = (output_values_int8.x + 8) * 16 + (output_values_int8.y + 8);
-    output_values_int4.s1 = (output_values_int8.z + 8) * 16 + (output_values_int8.w + 8);
- 
+    uchar s0 = input_ptr[index0/2];
+    uchar s1 = output_channel_4_idx + 1 >= output_channel ? 0 : input_ptr[index1/2];
+    uchar s2 = output_channel_4_idx + 1 >= output_channel ? 0 : input_ptr[index2/2];
+    uchar s3 = output_channel_4_idx + 1 >= output_channel ? 0 : input_ptr[index3/2];
+    output_values_int4.x = ((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f));
+    output_values_int4.y = ((index2 % 2) == 0 ? (s2 & 0xf0) : (s2 << 4)) | ((index3 % 2) == 0 ? (s3 >> 4) : (s3 & 0x0f));
+
     const int out_offset = (image_width_idx*height_width_size*((output_channel+3)/4)+image_height_idx)*2;
     vstore2(output_values_int4, 0, output+out_offset);
 }
@@ -137,106 +110,134 @@ __kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int4(GLOBAL_SIZE_2_DIMS
     a = (uchar16)(((b.s0 + 8) << 4) + b.s1 + 8, ((b.s2 + 8) << 4) + b.s3 + 8, ((b.s4 + 8) << 4) + b.s5 + 8, ((b.s6 + 8) << 4) + b.s7 + 8, ((b.s8 + 8) << 4) + b.s9 + 8, ((b.sa + 8) << 4) + b.sb + 8, ((b.sc + 8) << 4) + b.sd + 8, ((b.se + 8) << 4) + b.sf + 8,  \
                   ((c.s0 + 8) << 4) + c.s1 + 8, ((c.s2 + 8) << 4) + c.s3 + 8, ((c.s4 + 8) << 4) + c.s5 + 8, ((c.s6 + 8) << 4) + c.s7 + 8, ((c.s8 + 8) << 4) + c.s9 + 8, ((c.sa + 8) << 4) + c.sb + 8, ((c.sc + 8) << 4) + c.sd + 8, ((c.se + 8) << 4) + c.sf + 8);
 __kernel void conv2d_1x1_weight_quant_buffer(GLOBAL_SIZE_2_DIMS
-                                            __global const char *input_ptr,
 #ifdef USE_LOW_BIT_WEIGHT_INT4
-                                            __global uchar *output_ptr,
+                                            __global const uchar *input_ptr,
 #else
-                                            __global char *output_ptr,
+                                            __global const char *input_ptr,
 #endif
+                                            __global char *output_ptr,
                                             __private const int input_channel,
                                             __private const int output_channel) {
     int x  = get_global_id(0); // ic / 16
     int y = get_global_id(1); // oc
- 
+
     DEAL_NON_UNIFORM_DIM2(x, y);
     const int xin = x << 4;
     const int outputChannelC4 = (output_channel + 3) >> 2;
-    const int inputOffset = y * input_channel + xin;
-    char16 weight = 0;
-#ifdef INPUT_CHANNEL_LEAVE
-    if(xin + 15 >= input_channel){
-        char *weight_ptr = (char*)&weight;
-        for(int i = 0, j = 0; xin + i < input_channel && j < 16; ++i, ++j){
-            weight_ptr[j] = input_ptr[inputOffset + i];
-        }
-    }else {
-        weight = vload16(0, input_ptr + inputOffset);
+#ifdef USE_LOW_BIT_WEIGHT_INT4
+    const int outputOffset = ((x * outputChannelC4 * 4 * 8 + y * 8));
+#ifdef CHANNEL_LEAVE
+    for(int i = 0; i < 8; ++i){
+        int index0 = y * input_channel + xin + i * 2;
+        int index1 = y * input_channel + xin + i * 2 + 1;
+        uchar s0 = input_ptr[index0/2];
+        uchar s1 = input_ptr[index1/2];
+        output_ptr[outputOffset + i] = ((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f));
     }
 #else
-    weight = vload16(0, input_ptr + inputOffset);
+    const int inputOffset = (y * input_channel + xin)/2;
+    vstore8(convert_char8(vload8(0,input_ptr+inputOffset)),0,output_ptr+outputOffset);
 #endif
-    
-#ifdef USE_LOW_BIT_WEIGHT_INT4
-    const int outputOffset = ((x * outputChannelC4 * 4 * 8 + y * 8));
-    uchar8 outWeight;
-    CHAR16_TO_UCHAR8(outWeight, weight);
-    vstore8(outWeight, 0, output_ptr + outputOffset);
 #else
+    const int inputOffset = y * input_channel + xin;
     const int outputOffset = (x * outputChannelC4 * 4 + y) << 4;
-    vstore16(weight, 0, output_ptr + outputOffset);
+    vstore16(convert_char16(vload16(0, input_ptr + inputOffset)), 0, output_ptr + outputOffset);
 #endif
 }
 
 __kernel void conv2d_1x1_weight_quant_image(GLOBAL_SIZE_2_DIMS
-                                            __global const char *input_ptr,
+#ifdef USE_LOW_BIT_WEIGHT_INT4
+                                            __global const uchar *input_ptr,
+#else
+                                            __global const uchar *input_ptr,
+#endif
                                             __write_only image2d_t output,
                                             __private const int input_channel,
                                             __private const int output_channel) {
-    
-#ifdef USE_LOW_BIT_WEIGHT_INT4
-    int x  = get_global_id(0); // ic / 32
+
+    int x  = get_global_id(0); // ic / 16
     int y = get_global_id(1); // oc
- 
+
     DEAL_NON_UNIFORM_DIM2(x, y);
-    const int outputChannelC4 = (output_channel + 3) >> 2;
-    const int xin = x << 5;
-    const int inputOffset = y * input_channel + xin;
-    char16 weight00 = 0, weight01 = 0;
-#ifdef INPUT_CHANNEL_LEAVE
-    if(xin + 31 >= input_channel){
-        char *weight00_ptr = (char*)&weight00;
-        char *weight01_ptr = (char*)&weight01;
-        int i = 0;
-        for(int j = 0; xin + i < input_channel && j < 16; ++i, ++j){
-            weight00_ptr[j] = input_ptr[inputOffset + i];
-        }
-        for(int j = 0; xin + i < input_channel && j < 16; ++i, ++j){
-            weight01_ptr[j] = input_ptr[inputOffset + i];
-        }
-    }else {
-        weight00 = vload16(0, input_ptr + inputOffset);
-        weight01 = vload16(0, input_ptr + inputOffset + 16);
+    const int xin = x << 4;
+#ifdef USE_LOW_BIT_WEIGHT_INT4
+#ifdef CHANNEL_LEAVE
+    uchar8 out = 0;
+    uchar *out_ptr = (uchar*)&out;
+    for(int i = 0; i < 8; ++i){
+        int index0 = y * input_channel + xin + i * 2;
+        int index1 = y * input_channel + xin + i * 2 + 1;
+        uchar s0 = input_ptr[index0/2];
+        uchar s1 = input_ptr[index1/2];
+        out_ptr[i] = ((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f));
     }
+    write_imageui(output, (int2)(y, x), convert_uint4(as_ushort4(out)));
 #else
-    weight00 = vload16(0, input_ptr + inputOffset);
-    weight01 = vload16(0, input_ptr + inputOffset + 16);
+    const int inputOffset = (y * input_channel + xin)/2;
+    write_imageui(output, (int2)(y, x), convert_uint4(as_ushort4(vload8(0, input_ptr + inputOffset))));
 #endif
-    
-    uchar16 outWeight;
-    CHAR32_TO_UCHAR16(outWeight, weight00, weight01);
-    write_imagei(output, (int2)(y, x), as_int4(outWeight));
 #else
-    int x  = get_global_id(0); // ic / 16
-    int y = get_global_id(1); // oc
- 
-    DEAL_NON_UNIFORM_DIM2(x, y);
-    const int xin = x << 4;
     const int inputOffset = y * input_channel + xin;
-    const int outputChannelC4 = (output_channel + 3) >> 2;
-    char16 weight = 0;
-#ifdef INPUT_CHANNEL_LEAVE
-    if(xin + 15 >= input_channel){
-        char *weight_ptr = (char*)&weight;
-        for(int i = 0, j = 0; xin + i < input_channel && j < 16; ++i, ++j){
-            weight_ptr[j] = input_ptr[inputOffset + i];
+    write_imagei(output, (int2)(y, x), as_int4(vload16(0, input_ptr + inputOffset)));
+#endif
+}
+
+__kernel void conv2d_1x1_ic_oc_weight_quant_buffer(GLOBAL_SIZE_2_DIMS
+#ifdef USE_LOW_BIT_WEIGHT_INT4
+                                            __global const uchar *input_ptr,
+                                            __global uchar *output_ptr, //(Ci/packCin， Co/packCout, packCin， packCout)
+#else
+                                            __global const char *input_ptr,
+                                            __global char *output_ptr, //(Ci/packCin， Co/packCout, packCin， packCout)
+#endif
+                                            __private const int input_channel,
+                                            __private const int output_channel,
+                                            __private const int icPack,
+                                            __private const int ocPack) {
+    int x  = get_global_id(0); // ic / icPack
+    int y = get_global_id(1); // oc / ocPack
+
+    DEAL_NON_UNIFORM_DIM2(x, y);
+    const int xin = x * icPack;
+    const int yin = y * ocPack;
+    const int inputChannelC4 = (input_channel + icPack - 1) / icPack;
+    const int outputChannelC4 = (output_channel + ocPack - 1) / ocPack;
+#ifdef USE_LOW_BIT_WEIGHT_INT4
+    const int inputOffset = (yin * input_channel + xin) / 2;
+    const int outputOffset = ((x * outputChannelC4 + y) * icPack * ocPack) / 2;
+#ifdef CHANNEL_LEAVE
+    for(int i = 0; i < icPack; ++i){
+        for(int j = 0; j < ocPack / 2; ++j){
+            int index0 = (yin + j * 2) * input_channel + xin + i;
+            int index1 = (yin + j * 2 + 1) * input_channel + xin + i;
+            uchar s0 = input_ptr[index0/2];
+            uchar s1 = input_ptr[index1/2];
+            s0 = (index0 % 2) == 0 ? (s0 & 0xf0) : ((s0 & 0x0f) << 4);
+            s1 = (index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f);
+            output_ptr[outputOffset + i * (ocPack / 2) + j] = s0 | s1;
         }
-    }else {
-        weight = vload16(0, input_ptr + inputOffset);
     }
 #else
-    weight = vload16(0, input_ptr + inputOffset);
+    for(int i = 0; i < icPack/2; ++i){
+        for(int j = 0; j < ocPack / 2; ++j){
+            char s0 = input_ptr[inputOffset + (j * 2) * (input_channel / 2) + i];
+            char s1 = input_ptr[inputOffset + (j * 2 + 1) * (input_channel / 2) + i];
+            char d0 = (s0 & 0xf0) | ((s1 & 0xf0) >> 4);
+            char d1 = ((s0 & 0x0f) << 4) | (s1 & 0x0f);
+            output_ptr[outputOffset + (i * 2) * (ocPack / 2) + j] = d0;
+            output_ptr[outputOffset + (i * 2 + 1) * (ocPack / 2) + j] = d1;
+        }
+    }
 #endif
-    
-    write_imagei(output, (int2)(y, x), as_int4(weight));
+#else
+    const int inputOffset = yin * input_channel + xin;
+    const int outputOffset = (x * outputChannelC4 + y) * icPack * ocPack;
+    for(int i = 0; i < icPack; ++i){
+        for(int j = 0; j < ocPack; ++j){
+            output_ptr[outputOffset + i * ocPack + j] = input_ptr[inputOffset + j * input_channel + i];
+        }
+    }
 #endif
 }
+
+
diff --git a/source/backend/opencl/execution/cl/gemm_quant_batch_buf.cl b/source/backend/opencl/execution/cl/gemm_quant_batch_buf.cl
index 3fc8fd050..083268503 100644
--- a/source/backend/opencl/execution/cl/gemm_quant_batch_buf.cl
+++ b/source/backend/opencl/execution/cl/gemm_quant_batch_buf.cl
@@ -481,7 +481,8 @@ __kernel void gemm_b4_c1_buf(GLOBAL_SIZE_DIM2
 #endif
     vstore4(CONVERT_FLOAT4(out), 0, output+out_offset);
 }
-__kernel void gemm_b4_c2_image(GLOBAL_SIZE_DIM2
+
+__kernel void gemm_b4_c4_image(GLOBAL_SIZE_DIM2
                         __global const FLOAT* input,
                         __read_only image2d_t weight,
                         __global const float *dequantScaleOffset,
@@ -495,17 +496,18 @@ __kernel void gemm_b4_c2_image(GLOBAL_SIZE_DIM2
     const int y  = get_global_id(1); //b
     UNIFORM_BOUNDRY_CHECK(x, y);
 
-    const int out_c_idx = x << 1;
+    const int out_c_idx = x << 2;
     const int out_b_idx = y << 2;
         
-    COMPUTE_FLOAT2 bias0 = CONVERT_COMPUTE_FLOAT2(vload2(0, bias + out_c_idx));
+    COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(0, bias + out_c_idx));
     COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0.s0;
     COMPUTE_FLOAT4 out1 = (COMPUTE_FLOAT4)bias0.s1;
+    COMPUTE_FLOAT4 out2 = (COMPUTE_FLOAT4)bias0.s2;
+    COMPUTE_FLOAT4 out3 = (COMPUTE_FLOAT4)bias0.s3;
 
     int input_offset = out_b_idx * srcChannelC4 * 4;
     int out_offset = (out_b_idx * dstChannelC4 + out_c_idx) * 4;
     
-#if (defined USE_LOW_BIT_WEIGHT_INT8)
     const int loop = (blockDim + 15) / 16;
     #ifdef INPUT_CHANNEL_LEAVE
     const int loop_end = max(loop - 1, 0);
@@ -513,126 +515,208 @@ __kernel void gemm_b4_c2_image(GLOBAL_SIZE_DIM2
     #else
     const int loop_end = loop;
     #endif
-#elif (defined USE_LOW_BIT_WEIGHT_INT4)
-    const int loop = (blockDim + 31) / 32;
-    #ifdef INPUT_CHANNEL_LEAVE
-    const int loop_end = max(loop - 1, 0);
-    const int remain = blockDim - loop_end*32;
-    #else
-    const int loop_end = loop;
-    #endif
-#endif
     
     for (int i = 0; i < blockNum; i++){
         int kindex = i * dstChannelC4 * 4 * 2;
-        COMPUTE_FLOAT4 ScaleOffset = CONVERT_COMPUTE_FLOAT4(vload4(0, dequantScaleOffset + out_c_idx * 2 + kindex));
-#if (defined USE_LOW_BIT_WEIGHT_INT8)
+        COMPUTE_FLOAT8 ScaleOffset = CONVERT_COMPUTE_FLOAT8(vload8(0, dequantScaleOffset + out_c_idx * 2 + kindex));
         for (int j = 0; j < loop_end; j++) {
             int k = i * loop + j;
             int k16 = k << 4;
+            #if (defined USE_LOW_BIT_WEIGHT_INT8)
             COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1;
             COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3;
+            COMPUTE_FLOAT16 weights2 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 2, k)))) * ScaleOffset.s4 + ScaleOffset.s5;
+            COMPUTE_FLOAT16 weights3 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 3, k)))) * ScaleOffset.s6 + ScaleOffset.s7;
+            #elif (defined USE_LOW_BIT_WEIGHT_INT4)
+            COMPUTE_FLOAT16 weights0, weights1, weights2, weights3;
+            {
+                uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k))));
+                uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k))));
+                uchar8 charWeightsInt42 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 2, k))));
+                uchar8 charWeightsInt43 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 3, k))));
+                char16 charWeights0 = 0;
+                char16 charWeights1 = 0;
+                char16 charWeights2 = 0;
+                char16 charWeights3 = 0;
+                UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40);
+                UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41);
+                UCHAR8_TO_CHAR16(charWeights2, charWeightsInt42);
+                UCHAR8_TO_CHAR16(charWeights3, charWeightsInt43);
+                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1;
+                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3;
+                weights2 = CONVERT_COMPUTE_FLOAT16(charWeights2) * ScaleOffset.s4 + ScaleOffset.s5;
+                weights3 = CONVERT_COMPUTE_FLOAT16(charWeights3) * ScaleOffset.s6 + ScaleOffset.s7;
+            }
+            #endif
             COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0;
             COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1;
+            COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2;
+            COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3;
             #pragma unroll
             for (int i = 0; i < 16; ++i){
                 COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4));
                 out = mad(in, weights0_ptr[i], out);
                 out1 = mad(in, weights1_ptr[i], out1);
+                out2 = mad(in, weights2_ptr[i], out2);
+                out3 = mad(in, weights3_ptr[i], out3);
             }
         }
 #ifdef INPUT_CHANNEL_LEAVE
         {
             int k = i * loop + loop_end;
             int k16 = k << 4;
+            #if (defined USE_LOW_BIT_WEIGHT_INT8)
             COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1;
             COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3;
+            COMPUTE_FLOAT16 weights2 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 2, k)))) * ScaleOffset.s4 + ScaleOffset.s5;
+            COMPUTE_FLOAT16 weights3 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 3, k)))) * ScaleOffset.s6 + ScaleOffset.s7;
+            #elif (defined USE_LOW_BIT_WEIGHT_INT4)
+            COMPUTE_FLOAT16 weights0, weights1, weights2, weights3;
+            {
+                uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k))));
+                uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k))));
+                uchar8 charWeightsInt42 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 2, k))));
+                uchar8 charWeightsInt43 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 3, k))));
+                char16 charWeights0 = 0;
+                char16 charWeights1 = 0;
+                char16 charWeights2 = 0;
+                char16 charWeights3 = 0;
+                UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40);
+                UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41);
+                UCHAR8_TO_CHAR16(charWeights2, charWeightsInt42);
+                UCHAR8_TO_CHAR16(charWeights3, charWeightsInt43);
+                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1;
+                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3;
+                weights2 = CONVERT_COMPUTE_FLOAT16(charWeights2) * ScaleOffset.s4 + ScaleOffset.s5;
+                weights3 = CONVERT_COMPUTE_FLOAT16(charWeights3) * ScaleOffset.s6 + ScaleOffset.s7;
+            }
+            #endif
             COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0;
             COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1;
+            COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2;
+            COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3;
             #pragma unroll
             for (int i = 0; i < remain; ++i){
                 COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4));
                 out = mad(in, weights0_ptr[i], out);
                 out1 = mad(in, weights1_ptr[i], out1);
+                out2 = mad(in, weights2_ptr[i], out2);
+                out3 = mad(in, weights3_ptr[i], out3);
             }
         }
 #endif
-#elif (defined USE_LOW_BIT_WEIGHT_INT4)
+    }
+
+#ifdef RELU
+    out = fmax(out, (COMPUTE_FLOAT4)0);
+    out1 = fmax(out1, (COMPUTE_FLOAT4)0);
+    out2 = fmax(out2, (COMPUTE_FLOAT4)0);
+    out3 = fmax(out3, (COMPUTE_FLOAT4)0);
+#endif
+#ifdef RELU6
+    out = clamp(out, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
+    out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
+    out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
+    out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6);
+#endif
+    vstore4(CONVERT_FLOAT4(out), 0, output + out_offset);
+    vstore4(CONVERT_FLOAT4(out1), 0, output + out_offset + 4);
+    vstore4(CONVERT_FLOAT4(out2), 0, output + out_offset + 8);
+    vstore4(CONVERT_FLOAT4(out3), 0, output + out_offset + 12);
+}
+__kernel void gemm_b4_c2_image(GLOBAL_SIZE_DIM2
+                        __global const FLOAT* input,
+                        __read_only image2d_t weight,
+                        __global const float *dequantScaleOffset,
+                        __global const FLOAT *bias,
+                        __global FLOAT* output,
+                        __private const int dstChannelC4,
+                        __private const int srcChannelC4,
+                        __private const int blockNum,
+                        __private const int blockDim) {
+    const int x = get_global_id(0); //c
+    const int y  = get_global_id(1); //b
+    UNIFORM_BOUNDRY_CHECK(x, y);
+
+    const int out_c_idx = x << 1;
+    const int out_b_idx = y << 2;
+        
+    COMPUTE_FLOAT2 bias0 = CONVERT_COMPUTE_FLOAT2(vload2(0, bias + out_c_idx));
+    COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0.s0;
+    COMPUTE_FLOAT4 out1 = (COMPUTE_FLOAT4)bias0.s1;
+
+    int input_offset = out_b_idx * srcChannelC4 * 4;
+    int out_offset = (out_b_idx * dstChannelC4 + out_c_idx) * 4;
+    
+    const int loop = (blockDim + 15) / 16;
+    #ifdef INPUT_CHANNEL_LEAVE
+    const int loop_end = max(loop - 1, 0);
+    const int remain = blockDim - loop_end*16;
+    #else
+    const int loop_end = loop;
+    #endif
+    
+    for (int i = 0; i < blockNum; i++){
+        int kindex = i * dstChannelC4 * 4 * 2;
+        COMPUTE_FLOAT4 ScaleOffset = CONVERT_COMPUTE_FLOAT4(vload4(0, dequantScaleOffset + out_c_idx * 2 + kindex));
         for (int j = 0; j < loop_end; j++) {
             int k = i * loop + j;
-            int k32 = k << 5;
-            COMPUTE_FLOAT16 weights0, weights1, weights2, weights3;
+            int k16 = k << 4;
+            #if (defined USE_LOW_BIT_WEIGHT_INT8)
+            COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1;
+            COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3;
+            #elif (defined USE_LOW_BIT_WEIGHT_INT4)
+            COMPUTE_FLOAT16 weights0, weights1;
             {
-                uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)));
+                uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k))));
+                uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k))));
                 char16 charWeights0 = 0;
                 char16 charWeights1 = 0;
-                UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4);
+                UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40);
+                UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41);
                 weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1;
-                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1;
-            }
-            {
-                uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)));
-                char16 charWeights0 = 0;
-                char16 charWeights1 = 0;
-                UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4);
-                weights2 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3;
-                weights3 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3;
+                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3;
             }
+            #endif
             COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0;
             COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1;
-            COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2;
-            COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3;
             #pragma unroll
             for (int i = 0; i < 16; ++i){
-                COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4));
+                COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4));
                 out = mad(in, weights0_ptr[i], out);
-                out1 = mad(in, weights2_ptr[i], out1);
-            }
-            #pragma unroll
-            for (int i = 0; i < 16; ++i){
-                COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i + 16) * 4));
-                out = mad(in, weights1_ptr[i], out);
-                out1 = mad(in, weights3_ptr[i], out1);
+                out1 = mad(in, weights1_ptr[i], out1);
             }
         }
 #ifdef INPUT_CHANNEL_LEAVE
         {
             int k = i * loop + loop_end;
-            int k32 = k << 5;
-            COMPUTE_FLOAT16 weights0, weights1, weights2, weights3;
+            int k16 = k << 4;
+            #if (defined USE_LOW_BIT_WEIGHT_INT8)
+            COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1;
+            COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3;
+            #elif (defined USE_LOW_BIT_WEIGHT_INT4)
+            COMPUTE_FLOAT16 weights0, weights1;
             {
-                uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)));
+                uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k))));
+                uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k))));
                 char16 charWeights0 = 0;
                 char16 charWeights1 = 0;
-                UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4);
+                UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40);
+                UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41);
                 weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1;
-                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1;
-            }
-            {
-                uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)));
-                char16 charWeights0 = 0;
-                char16 charWeights1 = 0;
-                UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4);
-                weights2 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3;
-                weights3 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3;
+                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3;
             }
+            #endif
             COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0;
             COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1;
-            COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2;
-            COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3;
-            for (int i = 0; i < min(16, remain); ++i){
-                COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4));
+            #pragma unroll
+            for (int i = 0; i < remain; ++i){
+                COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4));
                 out = mad(in, weights0_ptr[i], out);
-                out1 = mad(in, weights2_ptr[i], out1);
-            }
-            for (int i = 16; i < remain; ++i){
-                COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4));
-                out = mad(in, weights1_ptr[i - 16], out);
-                out1 = mad(in, weights3_ptr[i - 16], out1);
+                out1 = mad(in, weights1_ptr[i], out1);
             }
         }
 #endif
-#endif //USE_LOW_BIT_WEIGHT_INT4
     }
 
 #ifdef RELU
@@ -669,7 +753,6 @@ __kernel void gemm_b4_c1_image(GLOBAL_SIZE_DIM2
     int input_offset = out_b_idx * srcChannelC4 * 4;
     int out_offset = (out_b_idx * dstChannelC4 + out_c_idx) * 4;
     
-#if (defined USE_LOW_BIT_WEIGHT_INT8)
     const int loop = (blockDim + 15) / 16;
     #ifdef INPUT_CHANNEL_LEAVE
     const int loop_end = max(loop - 1, 0);
@@ -677,24 +760,24 @@ __kernel void gemm_b4_c1_image(GLOBAL_SIZE_DIM2
     #else
     const int loop_end = loop;
     #endif
-#elif (defined USE_LOW_BIT_WEIGHT_INT4)
-    const int loop = (blockDim + 31) / 32;
-    #ifdef INPUT_CHANNEL_LEAVE
-    const int loop_end = max(loop - 1, 0);
-    const int remain = blockDim - loop_end*32;
-    #else
-    const int loop_end = loop;
-    #endif
-#endif
 
     for (int i = 0; i < blockNum; ++i){
         int kindex = i * dstChannelC4 * 4 * 2;
         COMPUTE_FLOAT2 ScaleOffset = CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx, dequantScaleOffset + kindex));
-#if (defined USE_LOW_BIT_WEIGHT_INT8)
         for (int j = 0; j < loop_end; j++) {
             int k = i * loop + j;
             int k16 = k << 4;
+            #if (defined USE_LOW_BIT_WEIGHT_INT8)
             COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1;
+            #elif (defined USE_LOW_BIT_WEIGHT_INT4)
+            COMPUTE_FLOAT16 weights0;
+            {
+                uchar8 charWeightsInt4 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k))));
+                char16 charWeights = 0;
+                UCHAR8_TO_CHAR16(charWeights, charWeightsInt4);
+                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1;
+            }
+            #endif
             COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0;
             #pragma unroll
             for (int i = 0; i < 16; ++i){
@@ -706,67 +789,25 @@ __kernel void gemm_b4_c1_image(GLOBAL_SIZE_DIM2
         {
             int k = i * loop + loop_end;
             int k16 = k << 4;
+            #if (defined USE_LOW_BIT_WEIGHT_INT8)
             COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1;
-            COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0;
-            #pragma unroll
-            for (int i = 0; i < remain; ++i){
-                COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4));
-                out = mad(in, weights0_ptr[i], out);
-            }
-        }
-#endif
-#elif (defined USE_LOW_BIT_WEIGHT_INT4)
-        for (int j = 0; j < loop_end; j++) {
-            int k = i * loop + j;
-            int k32 = k << 5;
-            COMPUTE_FLOAT16 weights0, weights1;
+            #elif (defined USE_LOW_BIT_WEIGHT_INT4)
+            COMPUTE_FLOAT16 weights0;
             {
-                uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)));
-                char16 charWeights0 = 0;
-                char16 charWeights1 = 0;
-                UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4);
-                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1;
-                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1;
+                uchar8 charWeightsInt4 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k))));
+                char16 charWeights = 0;
+                UCHAR8_TO_CHAR16(charWeights, charWeightsInt4);
+                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1;
             }
+            #endif
             COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0;
-            COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1;
             #pragma unroll
-            for (int i = 0; i < 16; ++i){
-                COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4));
-                out = mad(in, weights0_ptr[i], out);
-            }
-            #pragma unroll
-            for (int i = 0; i < 16; ++i){
-                COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i + 16) * 4));
-                out = mad(in, weights1_ptr[i], out);
-            }
-        }
-#ifdef INPUT_CHANNEL_LEAVE
-        {
-            int k = i * loop + loop_end;
-            int k32 = k << 5;
-            COMPUTE_FLOAT16 weights0, weights1;
-            {
-                uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)));
-                char16 charWeights0 = 0;
-                char16 charWeights1 = 0;
-                UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4);
-                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1;
-                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1;
-            }
-            COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0;
-            COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1;
-            for (int i = 0; i < min(16, remain); ++i){
-                COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4));
+            for (int i = 0; i < remain; ++i){
+                COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4));
                 out = mad(in, weights0_ptr[i], out);
             }
-            for (int i = 16; i < remain; ++i){
-                COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4));
-                out = mad(in, weights1_ptr[i - 16], out);
-            }
         }
 #endif
-#endif //USE_LOW_BIT_WEIGHT_INT4
     }
 
 #ifdef RELU
diff --git a/source/backend/opencl/execution/cl/gemv_conv1x1_buf.cl b/source/backend/opencl/execution/cl/gemv_conv1x1_buf.cl
index 7b4433111..82b7b02db 100644
--- a/source/backend/opencl/execution/cl/gemv_conv1x1_buf.cl
+++ b/source/backend/opencl/execution/cl/gemv_conv1x1_buf.cl
@@ -851,33 +851,37 @@ __kernel void gemm_conv_c2_image(GLOBAL_SIZE_DIM2
     int out_offset = (((out_b_idx * dstChannelC4 + out_c_idx/4) * height + out_h_idx) * width + out_w_idx) * 4 + (out_c_idx % 4);
     int wh = width * height * 4;
 
-#if (defined USE_LOW_BIT_WEIGHT_INT8)
     const int loop = (blockDim + 15) / 16;
     #ifdef INPUT_CHANNEL_LEAVE
     const int loop_end = max(loop - 1, 0);
     #else
     const int loop_end = loop;
     #endif
-#elif (defined USE_LOW_BIT_WEIGHT_INT4)
-    const int loop = (blockDim + 31) / 32;
-    #ifdef INPUT_CHANNEL_LEAVE
-    const int loop_end = max(loop - 1, 0);
-    #else
-    const int loop_end = loop;
-    #endif
-#endif
 
     for (int i = 0; i < blockNum; ++i){
         int kindex = i * dstChannelC4 * 4 * 2;
         COMPUTE_FLOAT4 ScaleOffset = CONVERT_COMPUTE_FLOAT4(vload4(0, dequantScaleOffset + out_c_idx * 2 + kindex));
-        #if (defined USE_LOW_BIT_WEIGHT_INT8)
         for (int j = 0; j < loop_end; j++) {
             int k = i * loop + j;
             #ifndef WIDTH_HEIGHT_1
             int k4 = k << 2;
             #endif
+            #if (defined USE_LOW_BIT_WEIGHT_INT8)
             COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1;
             COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3;
+            #elif (defined USE_LOW_BIT_WEIGHT_INT4)
+            COMPUTE_FLOAT16 weights0, weights1;
+            {
+                uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k))));
+                uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k))));
+                char16 charWeights0 = 0;
+                char16 charWeights1 = 0;
+                UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40);
+                UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41);
+                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1;
+                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3;
+            }
+            #endif
             {
                 COMPUTE_FLOAT16 in;
                 #ifdef WIDTH_HEIGHT_1
@@ -937,8 +941,22 @@ __kernel void gemm_conv_c2_image(GLOBAL_SIZE_DIM2
         {
             int k = i * loop + loop_end;
             int k4 = k << 2;
+            #if (defined USE_LOW_BIT_WEIGHT_INT8)
             COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1;
             COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3;
+            #elif (defined USE_LOW_BIT_WEIGHT_INT4)
+            COMPUTE_FLOAT16 weights0, weights1;
+            {
+                uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k))));
+                uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k))));
+                char16 charWeights0 = 0;
+                char16 charWeights1 = 0;
+                UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40);
+                UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41);
+                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1;
+                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3;
+            }
+            #endif
             PADZEROS(k, srcChannel, weights0);
             PADZEROS(k, srcChannel, weights1);
             {
@@ -981,215 +999,6 @@ __kernel void gemm_conv_c2_image(GLOBAL_SIZE_DIM2
             #endif
         }
         #endif
-        #elif (defined USE_LOW_BIT_WEIGHT_INT4)
-        for (int j = 0; j < loop_end; j++) {
-            int k = i * loop + j;
-            #ifndef WIDTH_HEIGHT_1
-            int k8 = k << 3;
-            #endif
-            COMPUTE_FLOAT16 weights0, weights1, weights2, weights3;
-            {
-                uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)));
-                char16 charWeights0 = 0;
-                char16 charWeights1 = 0;
-                UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4);
-                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1;
-                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1;
-            }
-            {
-                uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)));
-                char16 charWeights0 = 0;
-                char16 charWeights1 = 0;
-                UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4);
-                weights2 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3;
-                weights3 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3;
-            }
-            {
-                COMPUTE_FLOAT16 in0, in1;
-                #ifdef WIDTH_HEIGHT_1
-                in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * 32));
-                in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * 32 + 16));
-                #else
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 1) * wh));
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 2) * wh));
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 3) * wh));
-
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 4) * wh));
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 5) * wh));
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 6) * wh));
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 7) * wh));
-                #endif
-                
-                DOT16X16(in0, weights0, out.s0);
-                DOT16X16(in1, weights1, out.s0);
-                DOT16X16(in0, weights2, out.s1);
-                DOT16X16(in1, weights3, out.s1);
-            }
-            #ifdef BACTH_BLOCK4
-            if(isValidBatch1){
-                COMPUTE_FLOAT16 in0, in1;
-                #ifdef WIDTH_HEIGHT_1
-                in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset1 + k * 32));
-                in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset1 + k * 32 + 16));
-                #else
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 1) * wh));
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 2) * wh));
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 3) * wh));
-
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 4) * wh));
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 5) * wh));
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 6) * wh));
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 7) * wh));
-                #endif
-                DOT16X16(in0, weights0, out1.s0);
-                DOT16X16(in1, weights1, out1.s0);
-                DOT16X16(in0, weights2, out1.s1);
-                DOT16X16(in1, weights3, out1.s1);
-            }
-            if(isValidBatch2){
-                COMPUTE_FLOAT16 in0, in1;
-                #ifdef WIDTH_HEIGHT_1
-                in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset2 + k * 32));
-                in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset2 + k * 32 + 16));
-                #else
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 1) * wh));
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 2) * wh));
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 3) * wh));
-
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 4) * wh));
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 5) * wh));
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 6) * wh));
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 7) * wh));
-                #endif
-                DOT16X16(in0, weights0, out2.s0);
-                DOT16X16(in1, weights1, out2.s0);
-                DOT16X16(in0, weights2, out2.s1);
-                DOT16X16(in1, weights3, out2.s1);
-            }
-            if(isValidBatch3){
-                COMPUTE_FLOAT16 in0, in1;
-                #ifdef WIDTH_HEIGHT_1
-                in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset3 + k * 32));
-                in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset3 + k * 32 + 16));
-                #else
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 1) * wh));
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 2) * wh));
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 3) * wh));
-
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 4) * wh));
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 5) * wh));
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 6) * wh));
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 7) * wh));
-                #endif
-                DOT16X16(in0, weights0, out3.s0);
-                DOT16X16(in1, weights1, out3.s0);
-                DOT16X16(in0, weights2, out3.s1);
-                DOT16X16(in1, weights3, out3.s1);
-            }
-            #endif
-        }
-        #ifdef INPUT_CHANNEL_LEAVE
-        {
-            int k = i * loop + loop_end;
-            int k8 = k << 3;
-            COMPUTE_FLOAT16 weights0, weights1, weights2, weights3;
-            {
-                uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)));
-                char16 charWeights0 = 0;
-                char16 charWeights1 = 0;
-                UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4);
-                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1;
-                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1;
-            }
-            {
-                uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)));
-                char16 charWeights0 = 0;
-                char16 charWeights1 = 0;
-                UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4);
-                weights2 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3;
-                weights3 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3;
-            }
-            PADZEROS(k, srcChannel, weights0);
-            PADZEROS(k + 15, srcChannel, weights1);
-            PADZEROS(k, srcChannel, weights2);
-            PADZEROS(k + 15, srcChannel, weights3);
-            {
-                COMPUTE_FLOAT16 in0, in1;
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 1) * wh) : (FLOAT4)0);
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 2) * wh) : (FLOAT4)0);
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 3) * wh) : (FLOAT4)0);
-                        
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 4) * wh) : (FLOAT4)0);
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 5) * wh) : (FLOAT4)0);
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 6) * wh) : (FLOAT4)0);
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 7) * wh) : (FLOAT4)0);
-                
-                DOT16X16(in0, weights0, out.s0);
-                DOT16X16(in1, weights1, out.s0);
-                DOT16X16(in0, weights2, out.s1);
-                DOT16X16(in1, weights3, out.s1);
-            }
-            #ifdef BACTH_BLOCK4
-            if(isValidBatch1){
-                COMPUTE_FLOAT16 in0, in1;
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 1) * wh) : (FLOAT4)0);
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 2) * wh) : (FLOAT4)0);
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 3) * wh) : (FLOAT4)0);
-                        
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 4) * wh) : (FLOAT4)0);
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 5) * wh) : (FLOAT4)0);
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 6) * wh) : (FLOAT4)0);
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 7) * wh) : (FLOAT4)0);
-                
-                DOT16X16(in0, weights0, out1.s0);
-                DOT16X16(in1, weights1, out1.s0);
-                DOT16X16(in0, weights2, out1.s1);
-                DOT16X16(in1, weights3, out1.s1);
-            }
-            if(isValidBatch2){
-                COMPUTE_FLOAT16 in0, in1;
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 1) * wh) : (FLOAT4)0);
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 2) * wh) : (FLOAT4)0);
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 3) * wh) : (FLOAT4)0);
-                        
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 4) * wh) : (FLOAT4)0);
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 5) * wh) : (FLOAT4)0);
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 6) * wh) : (FLOAT4)0);
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 7) * wh) : (FLOAT4)0);
-                
-                DOT16X16(in0, weights0, out2.s0);
-                DOT16X16(in1, weights1, out2.s0);
-                DOT16X16(in0, weights2, out2.s1);
-                DOT16X16(in1, weights3, out2.s1);
-            }
-            if(isValidBatch3){
-                COMPUTE_FLOAT16 in0, in1;
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 1) * wh) : (FLOAT4)0);
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 2) * wh) : (FLOAT4)0);
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 3) * wh) : (FLOAT4)0);
-                        
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 4) * wh) : (FLOAT4)0);
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 5) * wh) : (FLOAT4)0);
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 6) * wh) : (FLOAT4)0);
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 7) * wh) : (FLOAT4)0);
-                
-                DOT16X16(in0, weights0, out3.s0);
-                DOT16X16(in1, weights1, out3.s0);
-                DOT16X16(in0, weights2, out3.s1);
-                DOT16X16(in1, weights3, out3.s1);
-            }
-            #endif
-        }
-        #endif
-    #endif //USE_LOW_BIT_WEIGHT_INT4
     }
 
 #ifdef RELU
@@ -1281,32 +1090,32 @@ __kernel void gemm_conv_c1_image(GLOBAL_SIZE_DIM2
     bool isValidBatch3 = out_b_idx + 3 < batch;
 #endif
 
-#if (defined USE_LOW_BIT_WEIGHT_INT8)
     const int loop = (blockDim + 15) / 16;
     #ifdef INPUT_CHANNEL_LEAVE
     const int loop_end = max(loop - 1, 0);
     #else
     const int loop_end = loop;
     #endif
-#elif (defined USE_LOW_BIT_WEIGHT_INT4)
-    const int loop = (blockDim + 31) / 32;
-    #ifdef INPUT_CHANNEL_LEAVE
-    const int loop_end = max(loop - 1, 0);
-    #else
-    const int loop_end = loop;
-    #endif
-#endif
     
     for (int i = 0; i < blockNum; ++i){
         int kindex = i * dstChannelC4 * 4 * 2;
         COMPUTE_FLOAT2 ScaleOffset = CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx, dequantScaleOffset + kindex));
-        #if (defined USE_LOW_BIT_WEIGHT_INT8)
         for (int j = 0; j < loop_end; j++) {
             int k = i * loop + j;
             #ifndef WIDTH_HEIGHT_1
             int k4 = k << 2;
             #endif
+            #if (defined USE_LOW_BIT_WEIGHT_INT8)
             COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1;
+            #elif (defined USE_LOW_BIT_WEIGHT_INT4)
+            COMPUTE_FLOAT16 weights0;
+            {
+                uchar8 charWeightsInt4 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k))));
+                char16 charWeights = 0;
+                UCHAR8_TO_CHAR16(charWeights, charWeightsInt4);
+                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1;
+            }
+            #endif
             {
                 COMPUTE_FLOAT16 in;
                 #ifdef WIDTH_HEIGHT_1
@@ -1362,7 +1171,18 @@ __kernel void gemm_conv_c1_image(GLOBAL_SIZE_DIM2
         {
             int k = i * loop + loop_end;
             int k4 = k << 2;
+            #if (defined USE_LOW_BIT_WEIGHT_INT8)
             COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1;
+            #elif (defined USE_LOW_BIT_WEIGHT_INT4)
+            COMPUTE_FLOAT16 weights0;
+            {
+                uchar8 charWeightsInt4 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k))));
+                char16 charWeights = 0;
+                UCHAR8_TO_CHAR16(charWeights, charWeightsInt4);
+                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1;
+            }
+            #endif
+            PADZEROS(k, srcChannel, weights0);
             {
                COMPUTE_FLOAT16 in;
                in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k4 * wh));
@@ -1371,7 +1191,6 @@ __kernel void gemm_conv_c1_image(GLOBAL_SIZE_DIM2
                in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 3) * wh) : (FLOAT4)0);
                DOT16X16(in, weights0, out);
             }
-            PADZEROS(k, srcChannel, weights0);
             #ifdef BACTH_BLOCK4
             if(isValidBatch1){
                 COMPUTE_FLOAT16 in;
@@ -1400,178 +1219,6 @@ __kernel void gemm_conv_c1_image(GLOBAL_SIZE_DIM2
             #endif
         }
         #endif
-        #elif (defined USE_LOW_BIT_WEIGHT_INT4)
-        for (int j = 0; j < loop_end; j++) {
-            int k = i * loop + j;
-            #ifndef WIDTH_HEIGHT_1
-            int k8 = k << 3;
-            #endif
-            COMPUTE_FLOAT16 weights0, weights1;
-            {
-                uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)));
-                char16 charWeights0 = 0;
-                char16 charWeights1 = 0;
-                UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4);
-                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1;
-                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1;
-            }
-            {
-                COMPUTE_FLOAT16 in0, in1;
-                #ifdef WIDTH_HEIGHT_1
-                in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * 32));
-                in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * 32 + 16));
-                #else
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 1) * wh));
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 2) * wh));
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 3) * wh));
-
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 4) * wh));
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 5) * wh));
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 6) * wh));
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 7) * wh));
-                #endif
-                DOT16X16(in0, weights0, out);
-                DOT16X16(in1, weights1, out);
-            }
-        
-            #ifdef BACTH_BLOCK4
-            if(isValidBatch1){
-                COMPUTE_FLOAT16 in0, in1;
-                #ifdef WIDTH_HEIGHT_1
-                in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset1 + k * 32));
-                in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset1 + k * 32 + 16));
-                #else
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 1) * wh));
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 2) * wh));
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 3) * wh));
-
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 4) * wh));
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 5) * wh));
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 6) * wh));
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 7) * wh));
-                #endif
-                DOT16X16(in0, weights0, out1);
-                DOT16X16(in1, weights1, out1);
-            }
-            if(isValidBatch2){
-                COMPUTE_FLOAT16 in0, in1;
-                #ifdef WIDTH_HEIGHT_1
-                in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset2 + k * 32));
-                in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset2 + k * 32 + 16));
-                #else
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 1) * wh));
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 2) * wh));
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 3) * wh));
-
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 4) * wh));
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 5) * wh));
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 6) * wh));
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 7) * wh));
-                #endif
-                DOT16X16(in0, weights0, out2);
-                DOT16X16(in1, weights1, out2);
-            }
-            if(isValidBatch3){
-                COMPUTE_FLOAT16 in0, in1;
-                #ifdef WIDTH_HEIGHT_1
-                in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset3 + k * 32));
-                in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset3 + k * 32 + 16));
-                #else
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 1) * wh));
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 2) * wh));
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 3) * wh));
-
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 4) * wh));
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 5) * wh));
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 6) * wh));
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 7) * wh));
-                #endif
-                DOT16X16(in0, weights0, out3);
-                DOT16X16(in1, weights1, out3);
-            }
-            #endif
-        }
-        #ifdef INPUT_CHANNEL_LEAVE
-        {
-            int k = i * loop + loop_end;
-            int k8 = k << 3;
-            COMPUTE_FLOAT16 weights0, weights1;
-            {
-                uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)));
-                char16 charWeights0 = 0;
-                char16 charWeights1 = 0;
-                UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4);
-                weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1;
-                weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1;
-            }
-            PADZEROS(k, srcChannel, weights0);
-            PADZEROS(k + 15, srcChannel, weights1);
-            {
-                COMPUTE_FLOAT16 in0, in1;
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 1) * wh) : (FLOAT4)0);
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 2) * wh) : (FLOAT4)0);
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 3) * wh) : (FLOAT4)0);
-                        
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 4) * wh) : (FLOAT4)0);
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 5) * wh) : (FLOAT4)0);
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 6) * wh) : (FLOAT4)0);
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 7) * wh) : (FLOAT4)0);
-                DOT16X16(in0, weights0, out);
-                DOT16X16(in1, weights1, out);
-            }
-        
-            #ifdef BACTH_BLOCK4
-            if(isValidBatch1){
-                COMPUTE_FLOAT16 in0, in1;
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 1) * wh) : (FLOAT4)0);
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 2) * wh) : (FLOAT4)0);
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 3) * wh) : (FLOAT4)0);
-                        
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 4) * wh) : (FLOAT4)0);
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 5) * wh) : (FLOAT4)0);
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 6) * wh) : (FLOAT4)0);
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 7) * wh) : (FLOAT4)0);
-                DOT16X16(in0, weights0, out1);
-                DOT16X16(in1, weights1, out1);
-            }
-            if(isValidBatch2){
-                COMPUTE_FLOAT16 in0, in1;
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 1) * wh) : (FLOAT4)0);
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 2) * wh) : (FLOAT4)0);
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 3) * wh) : (FLOAT4)0);
-                        
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 4) * wh) : (FLOAT4)0);
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 5) * wh) : (FLOAT4)0);
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 6) * wh) : (FLOAT4)0);
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 7) * wh) : (FLOAT4)0);
-                DOT16X16(in0, weights0, out2);
-                DOT16X16(in1, weights1, out2);
-            }
-            if(isValidBatch3){
-                COMPUTE_FLOAT16 in0, in1;
-                in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k8 * wh));
-                in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 1) * wh) : (FLOAT4)0);
-                in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 2) * wh) : (FLOAT4)0);
-                in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 3) * wh) : (FLOAT4)0);
-                        
-                in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 4) * wh) : (FLOAT4)0);
-                in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 5) * wh) : (FLOAT4)0);
-                in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 6) * wh) : (FLOAT4)0);
-                in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 7) * wh) : (FLOAT4)0);
-                DOT16X16(in0, weights0, out3);
-                DOT16X16(in1, weights1, out3);
-            }
-            #endif
-        }
-        #endif
-    #endif //USE_LOW_BIT_WEIGHT_INT4
     }
 
 #ifdef RELU
diff --git a/source/backend/opencl/execution/cl/groupnorm_buf.cl b/source/backend/opencl/execution/cl/groupnorm_buf.cl
index 59e848dff..395ac23d1 100644
--- a/source/backend/opencl/execution/cl/groupnorm_buf.cl
+++ b/source/backend/opencl/execution/cl/groupnorm_buf.cl
@@ -65,8 +65,8 @@ __kernel void groupnorm_plain_buf(__private int global_dim0, __private int globa
                 sum[lid] = sum[lid] + sum[lid + i];
             barrier(CLK_LOCAL_MEM_FENCE);
         }
-        float4 square_sum = sum[0] / (float4)inside;
-        float4 value = (float4)1.0f / (float4)sqrt(square_sum + (float4)epsilon);
+        float4 square_sum = (float4)(sum[0] / inside);
+        float4 value = (float4)(1.0f / sqrt(square_sum.x + epsilon));
 
         for(int i = lid; i < inside_v4; i+=LOCAL_SIZE){
             float4 in0 = convert_float4(vload4(i, input0 + offset));
@@ -102,7 +102,6 @@ __kernel void groupnorm_plain_buf(__private int global_dim0, __private int globa
             barrier(CLK_LOCAL_MEM_FENCE);
         }
 
-        
         float mean = sum[0] / inside;
 
         in_sum = 0;
@@ -173,8 +172,7 @@ __kernel void groupnorm_plain_buf(__private int global_dim0, __private int globa
             barrier(CLK_LOCAL_MEM_FENCE);
         }
         
-        float4 mean = sum[0] / (float4)inside;
-
+        float4 mean = (float4)(sum[0] / inside);
         in_sum = 0;
         index = lid;
         for(; index < inside_v4 - 1; index+=LOCAL_SIZE){
@@ -203,8 +201,8 @@ __kernel void groupnorm_plain_buf(__private int global_dim0, __private int globa
                 sum[lid] = sum[lid] + sum[lid + i];
             barrier(CLK_LOCAL_MEM_FENCE);
         }
-        float4 square_sum = sum[0] / (float4)inside;
-        float4 value = (float4)1.0f / (float4)sqrt(square_sum + (float4)epsilon);
+        float4 square_sum = (float4)(sum[0] / inside);
+        float4 value = (float4)(1.0f / sqrt(square_sum.x + epsilon));
 
         // The product of W and H is a multiple of 4
         #ifdef WH_4
@@ -220,6 +218,7 @@ __kernel void groupnorm_plain_buf(__private int global_dim0, __private int globa
             #ifdef SWISH
             out = out * native_recip((float4)1+native_exp(convert_float4(-out)));
             #endif
+
             vstore4(CONVERT_FLOAT4(out), i, output + offset);
         }
         #else
@@ -235,6 +234,7 @@ __kernel void groupnorm_plain_buf(__private int global_dim0, __private int globa
             #ifdef SWISH
             out = out * native_recip(1.0+native_exp(-out));
             #endif
+            
             output[offset+i] = (FLOAT)out;
         }
         #endif
diff --git a/source/backend/opencl/execution/cl/matmul_params_buf.cl b/source/backend/opencl/execution/cl/matmul_params_buf.cl
index ce1895d44..c4520fc8e 100644
--- a/source/backend/opencl/execution/cl/matmul_params_buf.cl
+++ b/source/backend/opencl/execution/cl/matmul_params_buf.cl
@@ -77,6 +77,26 @@
   #define USE_CL_MAD 0
 #endif
 
+// BIAS_TYPE
+// 0 -> without bias
+// 1 -> with bias (add) [N]
+// 2 -> with bias (eltwise_add) [M, N]
+// 3 -> with bias (eltwise_sub) [M, N]
+// 4 -> with bias (eltwise_sub and get negative) [M, N]
+#ifndef BIAS_TYPE
+  #define BIAS_TYPE 0
+#endif
+
+#if BIAS_TYPE == 1
+#define DEAL_BIAS(x, a) x = x + a
+#elif BIAS_TYPE == 2
+#define DEAL_BIAS(x, a) x = x + a
+#elif BIAS_TYPE == 3
+#define DEAL_BIAS(x, a) x = x - a
+#elif BIAS_TYPE == 4
+#define DEAL_BIAS(x, a) x = a - x
+#endif
+
 // By default the workgroup size requirement is enabled. For Qualcomm devices the workgroup size
 // requirement results in worse performance and is disabled (src/utilities/compile.cpp)
 #ifndef RELAX_WORKGROUP_SIZE
@@ -313,7 +333,7 @@ INLINE_FUNC int GlobalIndexA() {
 }
 
 INLINE_FUNC realM GlobalToPrivateOptA(const __global realM* restrict agm, const int base, const int _mi,
-                                   const int kSizeM, const int idk) {
+                                   const int astride/*kSizeM*/, const int idk) {
   // Computes the indices based on strided/non-strided access
   #if STRM == 0
     // [MWG/MWI, MWI/VWM, VWM]
@@ -325,7 +345,7 @@ INLINE_FUNC realM GlobalToPrivateOptA(const __global realM* restrict agm, const
 
   // Loads the data from global memory (not transposed) and stores into registers
   // [kSizeK, kSizeM/VWM, VWM]
-  return agm[idk*(kSizeM/VWM) + idm];
+  return agm[idk*(astride/VWM)+idm];
 }
 
 INLINE_FUNC realM GlobalToPrivateA(const __global realM* restrict agm, const int _mi,
@@ -366,7 +386,7 @@ INLINE_FUNC int GlobalIndexB() {
 }
 
 INLINE_FUNC realN GlobalToPrivateOptB(const __global realN* restrict bgm, const int base, const int _ni,
-                                   const int kSizeN, const int idk) {
+                                   const int bstride/*kSizeN*/, const int idk) {
   // Computes the indices based on strided/non-strided access
   #if STRN == 0
   int idn = base + _ni;
@@ -375,7 +395,7 @@ INLINE_FUNC realN GlobalToPrivateOptB(const __global realN* restrict bgm, const
   #endif
 
   // Loads the data from global memory (transposed) and stores into registers
-  return bgm[idk*(kSizeN/VWN) + idn];
+  return bgm[idk*(bstride/VWN)+idn];
 }
 
 INLINE_FUNC realN GlobalToPrivateB(const __global realN* restrict bgm, const int _ni,
@@ -677,11 +697,15 @@ INLINE_FUNC INT2 StoreIndexN() {
 // layout : [M, N]
 INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value,
                             const INT2 baseOffset,
-                            #ifdef BIAS
-                            realN* epm,
+                            #if BIAS_TYPE > 0
+                                #if BIAS_TYPE > 1
+                                __global realN* egm,
+                                #else
+                                realN* epm,
+                                #endif
                             #endif
                             const int _mi, const int _ni,
-                            const int kSizeN, const real alpha, const real beta) {
+                            const int cstride/*kSizeN*/, const int dstride/*kSizeN*/, const real alpha, const real beta) {
 
   #if STRM == 0
     int idm = _mi + baseOffset.index[0];
@@ -694,8 +718,8 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value,
     int idn = baseOffset.index[1] + _ni*NDIMC;
   #endif
 
-  int index = idm * (kSizeN/VWN) + idn;
-
+  int index = idm * (cstride/VWN) + idn;
+  
   realN result = c_value;
   
   // The final multiplication with alpha (in case beta == 0)
@@ -784,11 +808,17 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value,
   #endif
   
   
-#ifdef BIAS
-  realN eval = epm[_ni];
-
+#if BIAS_TYPE > 0
+    #if BIAS_TYPE == 1
+    realN eval = epm[_ni];
+    #else
+    
+    int index_bias = idm * (dstride/VWN) + idn;
+    realN eval = egm[index_bias];
+    #endif
+  
   #if VWN == 1
-    result += eval;
+    DEAL_BIAS(result, eval);
     #ifdef RELU
     result = fmax(result, (FLOAT)0);
     #endif
@@ -796,8 +826,8 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value,
     result = clamp(result, (FLOAT)0, (FLOAT)6);
     #endif
   #elif VWN == 2
-    result.x += eval.x;
-    result.y += eval.y;
+    DEAL_BIAS(result.x, eval.x);
+    DEAL_BIAS(result.y, eval.y);
     #ifdef RELU
     result = fmax(result, (FLOAT2)0);
     #endif
@@ -805,10 +835,10 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value,
     result = clamp(result, (FLOAT2)0, (FLOAT2)6);
     #endif
   #elif VWN == 4
-    result.x += eval.x;
-    result.y += eval.y;
-    result.z += eval.z;
-    result.w += eval.w;
+    DEAL_BIAS(result.x, eval.x);
+    DEAL_BIAS(result.y, eval.y);
+    DEAL_BIAS(result.z, eval.z);
+    DEAL_BIAS(result.w, eval.w);
     #ifdef RELU
     result = fmax(result, (FLOAT4)0);
     #endif
@@ -816,14 +846,14 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value,
     result = clamp(result, (FLOAT4)0, (FLOAT4)6);
     #endif
   #elif VWN == 8
-    result.s0 += eval.s0;
-    result.s1 += eval.s1;
-    result.s2 += eval.s2;
-    result.s3 += eval.s3;
-    result.s4 += eval.s4;
-    result.s5 += eval.s5;
-    result.s6 += eval.s6;
-    result.s7 += eval.s7;
+    DEAL_BIAS(result.s0, eval.s0);
+    DEAL_BIAS(result.s1, eval.s1);
+    DEAL_BIAS(result.s2, eval.s2);
+    DEAL_BIAS(result.s3, eval.s3);
+    DEAL_BIAS(result.s4, eval.s4);
+    DEAL_BIAS(result.s5, eval.s5);
+    DEAL_BIAS(result.s6, eval.s6);
+    DEAL_BIAS(result.s7, eval.s7);
     #ifdef RELU
     result = fmax(result, (FLOAT8)0);
     #endif
@@ -831,22 +861,22 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value,
     result = clamp(result, (FLOAT8)0, (FLOAT8)6);
     #endif
   #elif VWN == 16
-    result.s0 += eval.s0;
-    result.s1 += eval.s1;
-    result.s2 += eval.s2;
-    result.s3 += eval.s3;
-    result.s4 += eval.s4;
-    result.s5 += eval.s5;
-    result.s6 += eval.s6;
-    result.s7 += eval.s7;
-    result.s8 += eval.s8;
-    result.s9 += eval.s9;
-    result.sA += eval.sA;
-    result.sB += eval.sB;
-    result.sC += eval.sC;
-    result.sD += eval.sD;
-    result.sE += eval.sE;
-    result.sF += eval.sF;
+    DEAL_BIAS(result.s0, eval.s0);
+    DEAL_BIAS(result.s1, eval.s1);
+    DEAL_BIAS(result.s2, eval.s2);
+    DEAL_BIAS(result.s3, eval.s3);
+    DEAL_BIAS(result.s4, eval.s4);
+    DEAL_BIAS(result.s5, eval.s5);
+    DEAL_BIAS(result.s6, eval.s6);
+    DEAL_BIAS(result.s7, eval.s7);
+    DEAL_BIAS(result.s8, eval.s8);
+    DEAL_BIAS(result.s9, eval.s9);
+    DEAL_BIAS(result.sA, eval.sA);
+    DEAL_BIAS(result.sB, eval.sB);
+    DEAL_BIAS(result.sC, eval.sC);
+    DEAL_BIAS(result.sD, eval.sD);
+    DEAL_BIAS(result.sE, eval.sE);
+    DEAL_BIAS(result.sF, eval.sF);
     #ifdef RELU
     result = fmax(result, (FLOAT16)0);
     #endif
@@ -861,10 +891,10 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value,
 
 
 // Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.
-INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
+INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, const int4 stride,
                            const __global realM* restrict agm, const __global realN* restrict bgm,
-                           #ifdef BIAS
-                           const __global realN* restrict egm,
+                           #if BIAS_TYPE > 0
+                           __global realN* restrict egm,
                            #endif
                            __global realM* cgm, const real alpha, const real beta
                            #if SA == 1 && SB == 1
@@ -1076,12 +1106,12 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
             #pragma unroll
             for (int _ni = 0; _ni < NWI/VWN; _ni += 1) {
               // Loads data: off-chip --> private (matrix B)
-              bpm[_ni] = GlobalToPrivateOptB(bgm, baseIndexB, _ni, kSizeN, idk);
+              bpm[_ni] = GlobalToPrivateOptB(bgm, baseIndexB, _ni, stride.s1/*kSizeN*/, idk);
             }
 
             #pragma unroll
             for (int _mi = 0; _mi < MWI/VWM; _mi += 1) {
-              const realM aval = GlobalToPrivateOptA(agm, baseIndexA, _mi, kSizeM, idk);
+              const realM aval = GlobalToPrivateOptA(agm, baseIndexA, _mi, stride.s0/*kSizeM*/, idk);
               #pragma unroll
               for (int _ni = 0; _ni < NWI/VWN; _ni += 1) {
                 #if VWM == 1
@@ -1135,11 +1165,11 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
             #pragma unroll
             for (int _mi = 0; _mi < MWI/VWM; _mi += 1) {
               // Loads data: off-chip --> private (matrix B)
-              apm[_mi] = GlobalToPrivateOptA(agm, baseIndexA, _mi, kSizeM, idk);
+              apm[_mi] = GlobalToPrivateOptA(agm, baseIndexA, _mi, stride.s0/*kSizeM*/, idk);
             }
             #pragma unroll
             for (int _ni = 0; _ni < NWI/VWN; _ni += 1) {
-              const realN bval = GlobalToPrivateOptB(bgm, baseIndexB, _ni, kSizeN, idk);
+              const realN bval = GlobalToPrivateOptB(bgm, baseIndexB, _ni, stride.s1/*kSizeN*/, idk);
 
               #pragma unroll
               for (int _mi = 0; _mi < MWI/VWM; _mi += 1) {
@@ -1194,7 +1224,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
 
   #ifdef OUTPUTMN
       INT2 baseOffset = StoreIndexN();
-      #ifdef BIAS
+    #if BIAS_TYPE == 1
       #pragma promote_to_registers
       realN epm[NWI/VWN]; // MWI * 1
       for (int _ni = 0; _ni < NWI/VWN; _ni += 1) {
@@ -1205,17 +1235,22 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
           #endif
           epm[_ni] = egm[idn];
       }
-      #endif
+    #endif
+      
+      
+      
       #pragma unroll
       for (int _mi = 0; _mi < MWI; _mi += 1) {
         #pragma unroll
         for (int _ni = 0; _ni < NWI/VWN; _ni += 1) {
           StoreResultsN((__global realN* )cgm, cpn[_mi * (NWI/VWN) + _ni],
               baseOffset,
-              #ifdef BIAS
+              #if BIAS_TYPE > 1
+              (__global realN*)egm,
+              #elif BIAS_TYPE == 1
               (realN*)epm,
               #endif
-              _mi, _ni, kSizeN, alpha, beta);
+              _mi, _ni, stride.s2, stride.s3, alpha, beta);
         }
       }
   
@@ -1246,20 +1281,24 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
            const real_arg arg_beta,
            const __global realM* restrict agm, // [K, M]
            const __global realN* restrict bgm, // [K, N]
-           #ifdef BIAS
-           const __global realN* restrict egm, // [N]
+           #if BIAS_TYPE > 0
+           __global realN* restrict egm, // [N]
            #endif
            __global realM* cgm,
-           const int a_offset, const int b_offset, const int c_offset
+           __private const int4 offset,
+           __private const int4 stride
 ) {
     const real alpha = GetRealArg(arg_alpha);
     const real beta = GetRealArg(arg_beta);
   
     // Adds the offsets (in case of use of a single temporary buffer for A, B, and C)
-    agm = (const __global realM*)((const __global real*)agm + a_offset);
-    bgm = (const __global realN*)((const __global real*)bgm + b_offset);
-    cgm = (__global realM*)((const __global real*)cgm + c_offset);
+    agm = (const __global realM*)((const __global real*)agm + offset.s0);
+    bgm = (const __global realN*)((const __global real*)bgm + offset.s1);
+    cgm = (__global realM*)((__global real*)cgm + offset.s2);
   
+    #if BIAS_TYPE > 0
+    egm = (__global realN*)((__global real*)egm + offset.s3);
+    #endif
     // Allocates workgroup-private memory (local memory)
     #if SA == 1
         __local realM alm[KWG * MWG/VWM];
@@ -1270,26 +1309,26 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
   
     // Computes the matrix-multiplication and stores the result in global memory
     #if SA == 1 && SB == 1
-        XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm,
-          #ifdef BIAS
+        XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm, bgm,
+          #if BIAS_TYPE > 0
           egm,
           #endif
           cgm, alpha, beta, alm, blm);
     #elif SA == 1
-        XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm,
-          #ifdef BIAS
+        XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm, bgm,
+          #if BIAS_TYPE > 0
           egm,
           #endif
           cgm, alpha, beta, alm);
     #elif SB == 1
-        XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm,
-          #ifdef BIAS
+        XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm, bgm,
+          #if BIAS_TYPE > 0
           egm,
           #endif
           cgm, alpha, beta, blm);
     #else
-        XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm,
-          #ifdef BIAS
+        XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm, bgm,
+          #if BIAS_TYPE > 0
           egm,
           #endif
           cgm, alpha, beta);
@@ -1301,15 +1340,21 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
 #else
     __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
 #endif
-void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
+void XgemmBatched(const int kSizeM,
+                  const int kSizeN,
+                  const int kSizeK,
                   const real_arg arg_alpha,
                   const real_arg arg_beta,
-                  const __global realM* restrict agm, const int batch_offset_a,
-                  const __global realN* restrict bgm, const int batch_offset_b,
-                  #ifdef BIAS
-                  const __global realN* restrict egm, const int batch_offset_e,
+                  const __global realM* restrict agm,
+                  const int batch_offset_a,
+                  const __global realN* restrict bgm,
+                  const int batch_offset_b,
+                  #if BIAS_TYPE > 0
+                  __global realN* restrict egm,
+                  const int batch_offset_e,
                   #endif
-                  __global realM* cgm, const int batch_offset_c) {
+                  __global realM* cgm,
+                  const int batch_offset_c) {
     const int batch = get_group_id(2);
     const real alpha = GetRealArg(arg_alpha);
     const real beta = GetRealArg(arg_beta);
@@ -1322,9 +1367,9 @@ void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
     const __global realN* restrict bgm_ = &bgm[b_offset / VWN];
     __global realM* restrict cgm_ = &cgm[c_offset / VWM];
     
-    #ifdef BIAS
+    #if BIAS_TYPE > 0
     const int e_offset = batch * batch_offset_e;
-    const __global realN* restrict egm_ = &egm[e_offset / VWN];
+    __global realN* restrict egm_ = &egm[e_offset / VWN];
     #endif
   
     // Allocates workgroup-private memory (local memory)
@@ -1334,31 +1379,40 @@ void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK,
     #if SB == 1
         __local realN blm[KWG * NWG/VWN];
     #endif
-  
+    int4 stride;
+    stride.s0 = kSizeM;
+    stride.s1 = kSizeN;
+    #ifdef OUTPUTMN
+    stride.s2 = kSizeN;
+    #else
+    stride.s2 = kSizeM;
+    #endif
+    stride.s3 = kSizeN;
     // Computes the matrix-multiplication and stores the result in global memory
     #if SA == 1 && SB == 1
-        XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_,
-        #ifdef BIAS
+        XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm_, bgm_,
+        #if BIAS_TYPE > 0
         egm_,
         #endif
         cgm_, alpha, beta, alm, blm);
     #elif SA == 1
-        XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_,
-        #ifdef BIAS
+        XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm_, bgm_,
+        #if BIAS_TYPE > 0
         egm_,
         #endif
         cgm_, alpha, beta, alm);
     #elif SB == 1
-        XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_,
-        #ifdef BIAS
+        XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm_, bgm_,
+        #if BIAS_TYPE > 0
         egm_,
         #endif
         cgm_, alpha, beta, blm);
     #else
-        XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_,
-        #ifdef BIAS
+        XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm_, bgm_,
+        #if BIAS_TYPE > 0
         egm_,
         #endif
         cgm_, alpha, beta);
     #endif
 }
+
diff --git a/source/backend/opencl/execution/cl/opencl_program.cc b/source/backend/opencl/execution/cl/opencl_program.cc
index f98f47417..7eebdd550 100644
--- a/source/backend/opencl/execution/cl/opencl_program.cc
+++ b/source/backend/opencl/execution/cl/opencl_program.cc
@@ -3092,32 +3092,36 @@ const char* gemv_conv1x1_buf =
 " int input_offset=((out_b_idx*srcChannelC4*height+out_h_idx)*width+out_w_idx)*4;\n"
 " int out_offset=(((out_b_idx*dstChannelC4+out_c_idx/4)*height+out_h_idx)*width+out_w_idx)*4+(out_c_idx % 4);\n"
 " int wh=width*height*4;\n"
-"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " const int loop=(blockDim+15)/16;\n"
 " #ifdef INPUT_CHANNEL_LEAVE\n"
 " const int loop_end=max(loop-1,0);\n"
 " #else\n"
 " const int loop_end=loop;\n"
 " #endif\n"
-"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
-" const int loop=(blockDim+31)/32;\n"
-" #ifdef INPUT_CHANNEL_LEAVE\n"
-" const int loop_end=max(loop-1,0);\n"
-" #else\n"
-" const int loop_end=loop;\n"
-" #endif\n"
-"#endif\n"
 " for (int i=0; i<blockNum; ++i){\n"
 " int kindex=i*dstChannelC4*4*2;\n"
 " COMPUTE_FLOAT4 ScaleOffset=CONVERT_COMPUTE_FLOAT4(vload4(0,dequantScaleOffset+out_c_idx*2+kindex));\n"
-" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " for (int j=0; j<loop_end; j++) {\n"
 " int k=i*loop+j;\n"
 " #ifndef WIDTH_HEIGHT_1\n"
 " int k4=k << 2;\n"
 " #endif\n"
+" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " COMPUTE_FLOAT16 weights0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k))))*ScaleOffset.s0+ScaleOffset.s1;\n"
 " COMPUTE_FLOAT16 weights1=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+1,k))))*ScaleOffset.s2+ScaleOffset.s3;\n"
+" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
+" COMPUTE_FLOAT16 weights0,weights1;\n"
+" {\n"
+" uchar8 charWeightsInt40=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx,k))));\n"
+" uchar8 charWeightsInt41=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx+1,k))));\n"
+" char16 charWeights0=0;\n"
+" char16 charWeights1=0;\n"
+" UCHAR8_TO_CHAR16(charWeights0,charWeightsInt40);\n"
+" UCHAR8_TO_CHAR16(charWeights1,charWeightsInt41);\n"
+" weights0=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n"
+" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s2+ScaleOffset.s3;\n"
+" }\n"
+" #endif\n"
 " {\n"
 " COMPUTE_FLOAT16 in;\n"
 " #ifdef WIDTH_HEIGHT_1\n"
@@ -3177,8 +3181,22 @@ const char* gemv_conv1x1_buf =
 " {\n"
 " int k=i*loop+loop_end;\n"
 " int k4=k << 2;\n"
+" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " COMPUTE_FLOAT16 weights0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k))))*ScaleOffset.s0+ScaleOffset.s1;\n"
 " COMPUTE_FLOAT16 weights1=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+1,k))))*ScaleOffset.s2+ScaleOffset.s3;\n"
+" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
+" COMPUTE_FLOAT16 weights0,weights1;\n"
+" {\n"
+" uchar8 charWeightsInt40=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx,k))));\n"
+" uchar8 charWeightsInt41=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx+1,k))));\n"
+" char16 charWeights0=0;\n"
+" char16 charWeights1=0;\n"
+" UCHAR8_TO_CHAR16(charWeights0,charWeightsInt40);\n"
+" UCHAR8_TO_CHAR16(charWeights1,charWeightsInt41);\n"
+" weights0=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n"
+" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s2+ScaleOffset.s3;\n"
+" }\n"
+" #endif\n"
 " PADZEROS(k,srcChannel,weights0);\n"
 " PADZEROS(k,srcChannel,weights1);\n"
 " {\n"
@@ -3221,211 +3239,6 @@ const char* gemv_conv1x1_buf =
 " #endif\n"
 " }\n"
 " #endif\n"
-" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
-" for (int j=0; j<loop_end; j++) {\n"
-" int k=i*loop+j;\n"
-" #ifndef WIDTH_HEIGHT_1\n"
-" int k8=k << 3;\n"
-" #endif\n"
-" COMPUTE_FLOAT16 weights0,weights1,weights2,weights3;\n"
-" {\n"
-" uchar16 charWeightsInt4=as_uchar16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k)));\n"
-" char16 charWeights0=0;\n"
-" char16 charWeights1=0;\n"
-" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt4);\n"
-" weights0=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" }\n"
-" {\n"
-" uchar16 charWeightsInt4=as_uchar16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+1,k)));\n"
-" char16 charWeights0=0;\n"
-" char16 charWeights1=0;\n"
-" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt4);\n"
-" weights2=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s2+ScaleOffset.s3;\n"
-" weights3=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s2+ScaleOffset.s3;\n"
-" }\n"
-" {\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" #ifdef WIDTH_HEIGHT_1\n"
-" in0=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset+k*32));\n"
-" in1=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset+k*32+16));\n"
-" #else\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+1)*wh));\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+2)*wh));\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+3)*wh));\n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+4)*wh));\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+5)*wh));\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+6)*wh));\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+7)*wh));\n"
-" #endif\n"
-" \n"
-" DOT16X16(in0,weights0,out.s0);\n"
-" DOT16X16(in1,weights1,out.s0);\n"
-" DOT16X16(in0,weights2,out.s1);\n"
-" DOT16X16(in1,weights3,out.s1);\n"
-" }\n"
-" #ifdef BACTH_BLOCK4\n"
-" if(isValidBatch1){\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" #ifdef WIDTH_HEIGHT_1\n"
-" in0=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset1+k*32));\n"
-" in1=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset1+k*32+16));\n"
-" #else\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+1)*wh));\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+2)*wh));\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+3)*wh));\n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+4)*wh));\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+5)*wh));\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+6)*wh));\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+7)*wh));\n"
-" #endif\n"
-" DOT16X16(in0,weights0,out1.s0);\n"
-" DOT16X16(in1,weights1,out1.s0);\n"
-" DOT16X16(in0,weights2,out1.s1);\n"
-" DOT16X16(in1,weights3,out1.s1);\n"
-" }\n"
-" if(isValidBatch2){\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" #ifdef WIDTH_HEIGHT_1\n"
-" in0=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset2+k*32));\n"
-" in1=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset2+k*32+16));\n"
-" #else\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+1)*wh));\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+2)*wh));\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+3)*wh));\n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+4)*wh));\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+5)*wh));\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+6)*wh));\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+7)*wh));\n"
-" #endif\n"
-" DOT16X16(in0,weights0,out2.s0);\n"
-" DOT16X16(in1,weights1,out2.s0);\n"
-" DOT16X16(in0,weights2,out2.s1);\n"
-" DOT16X16(in1,weights3,out2.s1);\n"
-" }\n"
-" if(isValidBatch3){\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" #ifdef WIDTH_HEIGHT_1\n"
-" in0=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset3+k*32));\n"
-" in1=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset3+k*32+16));\n"
-" #else\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+1)*wh));\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+2)*wh));\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+3)*wh));\n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+4)*wh));\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+5)*wh));\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+6)*wh));\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+7)*wh));\n"
-" #endif\n"
-" DOT16X16(in0,weights0,out3.s0);\n"
-" DOT16X16(in1,weights1,out3.s0);\n"
-" DOT16X16(in0,weights2,out3.s1);\n"
-" DOT16X16(in1,weights3,out3.s1);\n"
-" }\n"
-" #endif\n"
-" }\n"
-" #ifdef INPUT_CHANNEL_LEAVE\n"
-" {\n"
-" int k=i*loop+loop_end;\n"
-" int k8=k << 3;\n"
-" COMPUTE_FLOAT16 weights0,weights1,weights2,weights3;\n"
-" {\n"
-" uchar16 charWeightsInt4=as_uchar16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k)));\n"
-" char16 charWeights0=0;\n"
-" char16 charWeights1=0;\n"
-" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt4);\n"
-" weights0=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" }\n"
-" {\n"
-" uchar16 charWeightsInt4=as_uchar16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+1,k)));\n"
-" char16 charWeights0=0;\n"
-" char16 charWeights1=0;\n"
-" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt4);\n"
-" weights2=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s2+ScaleOffset.s3;\n"
-" weights3=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s2+ScaleOffset.s3;\n"
-" }\n"
-" PADZEROS(k,srcChannel,weights0);\n"
-" PADZEROS(k+15,srcChannel,weights1);\n"
-" PADZEROS(k,srcChannel,weights2);\n"
-" PADZEROS(k+15,srcChannel,weights3);\n"
-" {\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(k8+1<srcChannelC4 ? vload4(0,input+input_offset+(k8+1)*wh) : (FLOAT4)0);\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(k8+2<srcChannelC4 ? vload4(0,input+input_offset+(k8+2)*wh) : (FLOAT4)0);\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(k8+3<srcChannelC4 ? vload4(0,input+input_offset+(k8+3)*wh) : (FLOAT4)0);\n"
-" \n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(k8+4<srcChannelC4 ? vload4(0,input+input_offset+(k8+4)*wh) : (FLOAT4)0);\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(k8+5<srcChannelC4 ? vload4(0,input+input_offset+(k8+5)*wh) : (FLOAT4)0);\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(k8+6<srcChannelC4 ? vload4(0,input+input_offset+(k8+6)*wh) : (FLOAT4)0);\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(k8+7<srcChannelC4 ? vload4(0,input+input_offset+(k8+7)*wh) : (FLOAT4)0);\n"
-" \n"
-" DOT16X16(in0,weights0,out.s0);\n"
-" DOT16X16(in1,weights1,out.s0);\n"
-" DOT16X16(in0,weights2,out.s1);\n"
-" DOT16X16(in1,weights3,out.s1);\n"
-" }\n"
-" #ifdef BACTH_BLOCK4\n"
-" if(isValidBatch1){\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(k8+1<srcChannelC4 ? vload4(0,input+input_offset1+(k8+1)*wh) : (FLOAT4)0);\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(k8+2<srcChannelC4 ? vload4(0,input+input_offset1+(k8+2)*wh) : (FLOAT4)0);\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(k8+3<srcChannelC4 ? vload4(0,input+input_offset1+(k8+3)*wh) : (FLOAT4)0);\n"
-" \n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(k8+4<srcChannelC4 ? vload4(0,input+input_offset1+(k8+4)*wh) : (FLOAT4)0);\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(k8+5<srcChannelC4 ? vload4(0,input+input_offset1+(k8+5)*wh) : (FLOAT4)0);\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(k8+6<srcChannelC4 ? vload4(0,input+input_offset1+(k8+6)*wh) : (FLOAT4)0);\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(k8+7<srcChannelC4 ? vload4(0,input+input_offset1+(k8+7)*wh) : (FLOAT4)0);\n"
-" \n"
-" DOT16X16(in0,weights0,out1.s0);\n"
-" DOT16X16(in1,weights1,out1.s0);\n"
-" DOT16X16(in0,weights2,out1.s1);\n"
-" DOT16X16(in1,weights3,out1.s1);\n"
-" }\n"
-" if(isValidBatch2){\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(k8+1<srcChannelC4 ? vload4(0,input+input_offset2+(k8+1)*wh) : (FLOAT4)0);\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(k8+2<srcChannelC4 ? vload4(0,input+input_offset2+(k8+2)*wh) : (FLOAT4)0);\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(k8+3<srcChannelC4 ? vload4(0,input+input_offset2+(k8+3)*wh) : (FLOAT4)0);\n"
-" \n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(k8+4<srcChannelC4 ? vload4(0,input+input_offset2+(k8+4)*wh) : (FLOAT4)0);\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(k8+5<srcChannelC4 ? vload4(0,input+input_offset2+(k8+5)*wh) : (FLOAT4)0);\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(k8+6<srcChannelC4 ? vload4(0,input+input_offset2+(k8+6)*wh) : (FLOAT4)0);\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(k8+7<srcChannelC4 ? vload4(0,input+input_offset2+(k8+7)*wh) : (FLOAT4)0);\n"
-" \n"
-" DOT16X16(in0,weights0,out2.s0);\n"
-" DOT16X16(in1,weights1,out2.s0);\n"
-" DOT16X16(in0,weights2,out2.s1);\n"
-" DOT16X16(in1,weights3,out2.s1);\n"
-" }\n"
-" if(isValidBatch3){\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset3+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(k8+1<srcChannelC4 ? vload4(0,input+input_offset3+(k8+1)*wh) : (FLOAT4)0);\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(k8+2<srcChannelC4 ? vload4(0,input+input_offset3+(k8+2)*wh) : (FLOAT4)0);\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(k8+3<srcChannelC4 ? vload4(0,input+input_offset3+(k8+3)*wh) : (FLOAT4)0);\n"
-" \n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(k8+4<srcChannelC4 ? vload4(0,input+input_offset3+(k8+4)*wh) : (FLOAT4)0);\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(k8+5<srcChannelC4 ? vload4(0,input+input_offset3+(k8+5)*wh) : (FLOAT4)0);\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(k8+6<srcChannelC4 ? vload4(0,input+input_offset3+(k8+6)*wh) : (FLOAT4)0);\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(k8+7<srcChannelC4 ? vload4(0,input+input_offset3+(k8+7)*wh) : (FLOAT4)0);\n"
-" \n"
-" DOT16X16(in0,weights0,out3.s0);\n"
-" DOT16X16(in1,weights1,out3.s0);\n"
-" DOT16X16(in0,weights2,out3.s1);\n"
-" DOT16X16(in1,weights3,out3.s1);\n"
-" }\n"
-" #endif\n"
-" }\n"
-" #endif\n"
-" #endif //USE_LOW_BIT_WEIGHT_INT4\n"
 " }\n"
 "#ifdef RELU\n"
 " out=fmax(out,(COMPUTE_FLOAT2)0);\n"
@@ -3511,32 +3324,32 @@ const char* gemv_conv1x1_buf =
 " bool isValidBatch2=out_b_idx+2<batch;\n"
 " bool isValidBatch3=out_b_idx+3<batch;\n"
 "#endif\n"
-"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " const int loop=(blockDim+15)/16;\n"
 " #ifdef INPUT_CHANNEL_LEAVE\n"
 " const int loop_end=max(loop-1,0);\n"
 " #else\n"
 " const int loop_end=loop;\n"
 " #endif\n"
-"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
-" const int loop=(blockDim+31)/32;\n"
-" #ifdef INPUT_CHANNEL_LEAVE\n"
-" const int loop_end=max(loop-1,0);\n"
-" #else\n"
-" const int loop_end=loop;\n"
-" #endif\n"
-"#endif\n"
 " \n"
 " for (int i=0; i<blockNum; ++i){\n"
 " int kindex=i*dstChannelC4*4*2;\n"
 " COMPUTE_FLOAT2 ScaleOffset=CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx,dequantScaleOffset+kindex));\n"
-" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " for (int j=0; j<loop_end; j++) {\n"
 " int k=i*loop+j;\n"
 " #ifndef WIDTH_HEIGHT_1\n"
 " int k4=k << 2;\n"
 " #endif\n"
+" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " COMPUTE_FLOAT16 weights0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k))))*ScaleOffset.s0+ScaleOffset.s1;\n"
+" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
+" COMPUTE_FLOAT16 weights0;\n"
+" {\n"
+" uchar8 charWeightsInt4=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx,k))));\n"
+" char16 charWeights=0;\n"
+" UCHAR8_TO_CHAR16(charWeights,charWeightsInt4);\n"
+" weights0=CONVERT_COMPUTE_FLOAT16(charWeights)*ScaleOffset.s0+ScaleOffset.s1;\n"
+" }\n"
+" #endif\n"
 " {\n"
 " COMPUTE_FLOAT16 in;\n"
 " #ifdef WIDTH_HEIGHT_1\n"
@@ -3592,7 +3405,18 @@ const char* gemv_conv1x1_buf =
 " {\n"
 " int k=i*loop+loop_end;\n"
 " int k4=k << 2;\n"
+" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " COMPUTE_FLOAT16 weights0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k))))*ScaleOffset.s0+ScaleOffset.s1;\n"
+" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
+" COMPUTE_FLOAT16 weights0;\n"
+" {\n"
+" uchar8 charWeightsInt4=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx,k))));\n"
+" char16 charWeights=0;\n"
+" UCHAR8_TO_CHAR16(charWeights,charWeightsInt4);\n"
+" weights0=CONVERT_COMPUTE_FLOAT16(charWeights)*ScaleOffset.s0+ScaleOffset.s1;\n"
+" }\n"
+" #endif\n"
+" PADZEROS(k,srcChannel,weights0);\n"
 " {\n"
 " COMPUTE_FLOAT16 in;\n"
 " in.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k4*wh));\n"
@@ -3601,7 +3425,6 @@ const char* gemv_conv1x1_buf =
 " in.scdef=CONVERT_COMPUTE_FLOAT4(k4+3<srcChannelC4 ? vload4(0,input+input_offset+(k4+3)*wh) : (FLOAT4)0);\n"
 " DOT16X16(in,weights0,out);\n"
 " }\n"
-" PADZEROS(k,srcChannel,weights0);\n"
 " #ifdef BACTH_BLOCK4\n"
 " if(isValidBatch1){\n"
 " COMPUTE_FLOAT16 in;\n"
@@ -3630,174 +3453,6 @@ const char* gemv_conv1x1_buf =
 " #endif\n"
 " }\n"
 " #endif\n"
-" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
-" for (int j=0; j<loop_end; j++) {\n"
-" int k=i*loop+j;\n"
-" #ifndef WIDTH_HEIGHT_1\n"
-" int k8=k << 3;\n"
-" #endif\n"
-" COMPUTE_FLOAT16 weights0,weights1;\n"
-" {\n"
-" uchar16 charWeightsInt4=as_uchar16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k)));\n"
-" char16 charWeights0=0;\n"
-" char16 charWeights1=0;\n"
-" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt4);\n"
-" weights0=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" }\n"
-" {\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" #ifdef WIDTH_HEIGHT_1\n"
-" in0=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset+k*32));\n"
-" in1=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset+k*32+16));\n"
-" #else\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+1)*wh));\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+2)*wh));\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+3)*wh));\n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+4)*wh));\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+5)*wh));\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+6)*wh));\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k8+7)*wh));\n"
-" #endif\n"
-" DOT16X16(in0,weights0,out);\n"
-" DOT16X16(in1,weights1,out);\n"
-" }\n"
-" \n"
-" #ifdef BACTH_BLOCK4\n"
-" if(isValidBatch1){\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" #ifdef WIDTH_HEIGHT_1\n"
-" in0=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset1+k*32));\n"
-" in1=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset1+k*32+16));\n"
-" #else\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+1)*wh));\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+2)*wh));\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+3)*wh));\n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+4)*wh));\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+5)*wh));\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+6)*wh));\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+(k8+7)*wh));\n"
-" #endif\n"
-" DOT16X16(in0,weights0,out1);\n"
-" DOT16X16(in1,weights1,out1);\n"
-" }\n"
-" if(isValidBatch2){\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" #ifdef WIDTH_HEIGHT_1\n"
-" in0=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset2+k*32));\n"
-" in1=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset2+k*32+16));\n"
-" #else\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+1)*wh));\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+2)*wh));\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+3)*wh));\n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+4)*wh));\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+5)*wh));\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+6)*wh));\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+(k8+7)*wh));\n"
-" #endif\n"
-" DOT16X16(in0,weights0,out2);\n"
-" DOT16X16(in1,weights1,out2);\n"
-" }\n"
-" if(isValidBatch3){\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" #ifdef WIDTH_HEIGHT_1\n"
-" in0=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset3+k*32));\n"
-" in1=CONVERT_COMPUTE_FLOAT16(vload16(0,input+input_offset3+k*32+16));\n"
-" #else\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset3+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset3+(k8+1)*wh));\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset3+(k8+2)*wh));\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset3+(k8+3)*wh));\n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset3+(k8+4)*wh));\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset3+(k8+5)*wh));\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset3+(k8+6)*wh));\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset3+(k8+7)*wh));\n"
-" #endif\n"
-" DOT16X16(in0,weights0,out3);\n"
-" DOT16X16(in1,weights1,out3);\n"
-" }\n"
-" #endif\n"
-" }\n"
-" #ifdef INPUT_CHANNEL_LEAVE\n"
-" {\n"
-" int k=i*loop+loop_end;\n"
-" int k8=k << 3;\n"
-" COMPUTE_FLOAT16 weights0,weights1;\n"
-" {\n"
-" uchar16 charWeightsInt4=as_uchar16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k)));\n"
-" char16 charWeights0=0;\n"
-" char16 charWeights1=0;\n"
-" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt4);\n"
-" weights0=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" }\n"
-" PADZEROS(k,srcChannel,weights0);\n"
-" PADZEROS(k+15,srcChannel,weights1);\n"
-" {\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(k8+1<srcChannelC4 ? vload4(0,input+input_offset+(k8+1)*wh) : (FLOAT4)0);\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(k8+2<srcChannelC4 ? vload4(0,input+input_offset+(k8+2)*wh) : (FLOAT4)0);\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(k8+3<srcChannelC4 ? vload4(0,input+input_offset+(k8+3)*wh) : (FLOAT4)0);\n"
-" \n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(k8+4<srcChannelC4 ? vload4(0,input+input_offset+(k8+4)*wh) : (FLOAT4)0);\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(k8+5<srcChannelC4 ? vload4(0,input+input_offset+(k8+5)*wh) : (FLOAT4)0);\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(k8+6<srcChannelC4 ? vload4(0,input+input_offset+(k8+6)*wh) : (FLOAT4)0);\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(k8+7<srcChannelC4 ? vload4(0,input+input_offset+(k8+7)*wh) : (FLOAT4)0);\n"
-" DOT16X16(in0,weights0,out);\n"
-" DOT16X16(in1,weights1,out);\n"
-" }\n"
-" \n"
-" #ifdef BACTH_BLOCK4\n"
-" if(isValidBatch1){\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset1+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(k8+1<srcChannelC4 ? vload4(0,input+input_offset1+(k8+1)*wh) : (FLOAT4)0);\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(k8+2<srcChannelC4 ? vload4(0,input+input_offset1+(k8+2)*wh) : (FLOAT4)0);\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(k8+3<srcChannelC4 ? vload4(0,input+input_offset1+(k8+3)*wh) : (FLOAT4)0);\n"
-" \n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(k8+4<srcChannelC4 ? vload4(0,input+input_offset1+(k8+4)*wh) : (FLOAT4)0);\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(k8+5<srcChannelC4 ? vload4(0,input+input_offset1+(k8+5)*wh) : (FLOAT4)0);\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(k8+6<srcChannelC4 ? vload4(0,input+input_offset1+(k8+6)*wh) : (FLOAT4)0);\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(k8+7<srcChannelC4 ? vload4(0,input+input_offset1+(k8+7)*wh) : (FLOAT4)0);\n"
-" DOT16X16(in0,weights0,out1);\n"
-" DOT16X16(in1,weights1,out1);\n"
-" }\n"
-" if(isValidBatch2){\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset2+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(k8+1<srcChannelC4 ? vload4(0,input+input_offset2+(k8+1)*wh) : (FLOAT4)0);\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(k8+2<srcChannelC4 ? vload4(0,input+input_offset2+(k8+2)*wh) : (FLOAT4)0);\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(k8+3<srcChannelC4 ? vload4(0,input+input_offset2+(k8+3)*wh) : (FLOAT4)0);\n"
-" \n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(k8+4<srcChannelC4 ? vload4(0,input+input_offset2+(k8+4)*wh) : (FLOAT4)0);\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(k8+5<srcChannelC4 ? vload4(0,input+input_offset2+(k8+5)*wh) : (FLOAT4)0);\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(k8+6<srcChannelC4 ? vload4(0,input+input_offset2+(k8+6)*wh) : (FLOAT4)0);\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(k8+7<srcChannelC4 ? vload4(0,input+input_offset2+(k8+7)*wh) : (FLOAT4)0);\n"
-" DOT16X16(in0,weights0,out2);\n"
-" DOT16X16(in1,weights1,out2);\n"
-" }\n"
-" if(isValidBatch3){\n"
-" COMPUTE_FLOAT16 in0,in1;\n"
-" in0.s0123=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset3+k8*wh));\n"
-" in0.s4567=CONVERT_COMPUTE_FLOAT4(k8+1<srcChannelC4 ? vload4(0,input+input_offset3+(k8+1)*wh) : (FLOAT4)0);\n"
-" in0.s89ab=CONVERT_COMPUTE_FLOAT4(k8+2<srcChannelC4 ? vload4(0,input+input_offset3+(k8+2)*wh) : (FLOAT4)0);\n"
-" in0.scdef=CONVERT_COMPUTE_FLOAT4(k8+3<srcChannelC4 ? vload4(0,input+input_offset3+(k8+3)*wh) : (FLOAT4)0);\n"
-" \n"
-" in1.s0123=CONVERT_COMPUTE_FLOAT4(k8+4<srcChannelC4 ? vload4(0,input+input_offset3+(k8+4)*wh) : (FLOAT4)0);\n"
-" in1.s4567=CONVERT_COMPUTE_FLOAT4(k8+5<srcChannelC4 ? vload4(0,input+input_offset3+(k8+5)*wh) : (FLOAT4)0);\n"
-" in1.s89ab=CONVERT_COMPUTE_FLOAT4(k8+6<srcChannelC4 ? vload4(0,input+input_offset3+(k8+6)*wh) : (FLOAT4)0);\n"
-" in1.scdef=CONVERT_COMPUTE_FLOAT4(k8+7<srcChannelC4 ? vload4(0,input+input_offset3+(k8+7)*wh) : (FLOAT4)0);\n"
-" DOT16X16(in0,weights0,out3);\n"
-" DOT16X16(in1,weights1,out3);\n"
-" }\n"
-" #endif\n"
-" }\n"
-" #endif\n"
-" #endif //USE_LOW_BIT_WEIGHT_INT4\n"
 " }\n"
 "#ifdef RELU\n"
 " out=fmax(out,(COMPUTE_FLOAT)0);\n"
@@ -7427,7 +7082,7 @@ const char* gemm_quant_batch_buf =
 "#endif\n"
 " vstore4(CONVERT_FLOAT4(out),0,output+out_offset);\n"
 "}\n"
-"__kernel void gemm_b4_c2_image(GLOBAL_SIZE_DIM2\n"
+"__kernel void gemm_b4_c4_image(GLOBAL_SIZE_DIM2\n"
 " __global const FLOAT* input,\n"
 " __read_only image2d_t weight,\n"
 " __global const float *dequantScaleOffset,\n"
@@ -7440,16 +7095,17 @@ const char* gemm_quant_batch_buf =
 " const int x=get_global_id(0); //c\n"
 " const int y=get_global_id(1); //b\n"
 " UNIFORM_BOUNDRY_CHECK(x,y);\n"
-" const int out_c_idx=x << 1;\n"
+" const int out_c_idx=x << 2;\n"
 " const int out_b_idx=y << 2;\n"
 " \n"
-" COMPUTE_FLOAT2 bias0=CONVERT_COMPUTE_FLOAT2(vload2(0,bias+out_c_idx));\n"
+" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(0,bias+out_c_idx));\n"
 " COMPUTE_FLOAT4 out=(COMPUTE_FLOAT4)bias0.s0;\n"
 " COMPUTE_FLOAT4 out1=(COMPUTE_FLOAT4)bias0.s1;\n"
+" COMPUTE_FLOAT4 out2=(COMPUTE_FLOAT4)bias0.s2;\n"
+" COMPUTE_FLOAT4 out3=(COMPUTE_FLOAT4)bias0.s3;\n"
 " int input_offset=out_b_idx*srcChannelC4*4;\n"
 " int out_offset=(out_b_idx*dstChannelC4+out_c_idx)*4;\n"
 " \n"
-"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " const int loop=(blockDim+15)/16;\n"
 " #ifdef INPUT_CHANNEL_LEAVE\n"
 " const int loop_end=max(loop-1,0);\n"
@@ -7457,126 +7113,205 @@ const char* gemm_quant_batch_buf =
 " #else\n"
 " const int loop_end=loop;\n"
 " #endif\n"
-"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
-" const int loop=(blockDim+31)/32;\n"
-" #ifdef INPUT_CHANNEL_LEAVE\n"
-" const int loop_end=max(loop-1,0);\n"
-" const int remain=blockDim-loop_end*32;\n"
-" #else\n"
-" const int loop_end=loop;\n"
-" #endif\n"
-"#endif\n"
 " \n"
 " for (int i=0; i<blockNum; i++){\n"
 " int kindex=i*dstChannelC4*4*2;\n"
-" COMPUTE_FLOAT4 ScaleOffset=CONVERT_COMPUTE_FLOAT4(vload4(0,dequantScaleOffset+out_c_idx*2+kindex));\n"
-"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
+" COMPUTE_FLOAT8 ScaleOffset=CONVERT_COMPUTE_FLOAT8(vload8(0,dequantScaleOffset+out_c_idx*2+kindex));\n"
 " for (int j=0; j<loop_end; j++) {\n"
 " int k=i*loop+j;\n"
 " int k16=k << 4;\n"
+" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " COMPUTE_FLOAT16 weights0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k))))*ScaleOffset.s0+ScaleOffset.s1;\n"
 " COMPUTE_FLOAT16 weights1=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+1,k))))*ScaleOffset.s2+ScaleOffset.s3;\n"
+" COMPUTE_FLOAT16 weights2=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+2,k))))*ScaleOffset.s4+ScaleOffset.s5;\n"
+" COMPUTE_FLOAT16 weights3=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+3,k))))*ScaleOffset.s6+ScaleOffset.s7;\n"
+" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
+" COMPUTE_FLOAT16 weights0,weights1,weights2,weights3;\n"
+" {\n"
+" uchar8 charWeightsInt40=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx,k))));\n"
+" uchar8 charWeightsInt41=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx+1,k))));\n"
+" uchar8 charWeightsInt42=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx+2,k))));\n"
+" uchar8 charWeightsInt43=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx+3,k))));\n"
+" char16 charWeights0=0;\n"
+" char16 charWeights1=0;\n"
+" char16 charWeights2=0;\n"
+" char16 charWeights3=0;\n"
+" UCHAR8_TO_CHAR16(charWeights0,charWeightsInt40);\n"
+" UCHAR8_TO_CHAR16(charWeights1,charWeightsInt41);\n"
+" UCHAR8_TO_CHAR16(charWeights2,charWeightsInt42);\n"
+" UCHAR8_TO_CHAR16(charWeights3,charWeightsInt43);\n"
+" weights0=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n"
+" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s2+ScaleOffset.s3;\n"
+" weights2=CONVERT_COMPUTE_FLOAT16(charWeights2)*ScaleOffset.s4+ScaleOffset.s5;\n"
+" weights3=CONVERT_COMPUTE_FLOAT16(charWeights3)*ScaleOffset.s6+ScaleOffset.s7;\n"
+" }\n"
+" #endif\n"
 " COMPUTE_FLOAT *weights0_ptr=(COMPUTE_FLOAT *)&weights0;\n"
 " COMPUTE_FLOAT *weights1_ptr=(COMPUTE_FLOAT *)&weights1;\n"
+" COMPUTE_FLOAT *weights2_ptr=(COMPUTE_FLOAT *)&weights2;\n"
+" COMPUTE_FLOAT *weights3_ptr=(COMPUTE_FLOAT *)&weights3;\n"
 " #pragma unroll\n"
 " for (int i=0; i<16; ++i){\n"
 " COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k16+i)*4));\n"
 " out=mad(in,weights0_ptr[i],out);\n"
 " out1=mad(in,weights1_ptr[i],out1);\n"
+" out2=mad(in,weights2_ptr[i],out2);\n"
+" out3=mad(in,weights3_ptr[i],out3);\n"
 " }\n"
 " }\n"
 "#ifdef INPUT_CHANNEL_LEAVE\n"
 " {\n"
 " int k=i*loop+loop_end;\n"
 " int k16=k << 4;\n"
+" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " COMPUTE_FLOAT16 weights0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k))))*ScaleOffset.s0+ScaleOffset.s1;\n"
 " COMPUTE_FLOAT16 weights1=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+1,k))))*ScaleOffset.s2+ScaleOffset.s3;\n"
+" COMPUTE_FLOAT16 weights2=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+2,k))))*ScaleOffset.s4+ScaleOffset.s5;\n"
+" COMPUTE_FLOAT16 weights3=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+3,k))))*ScaleOffset.s6+ScaleOffset.s7;\n"
+" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
+" COMPUTE_FLOAT16 weights0,weights1,weights2,weights3;\n"
+" {\n"
+" uchar8 charWeightsInt40=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx,k))));\n"
+" uchar8 charWeightsInt41=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx+1,k))));\n"
+" uchar8 charWeightsInt42=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx+2,k))));\n"
+" uchar8 charWeightsInt43=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx+3,k))));\n"
+" char16 charWeights0=0;\n"
+" char16 charWeights1=0;\n"
+" char16 charWeights2=0;\n"
+" char16 charWeights3=0;\n"
+" UCHAR8_TO_CHAR16(charWeights0,charWeightsInt40);\n"
+" UCHAR8_TO_CHAR16(charWeights1,charWeightsInt41);\n"
+" UCHAR8_TO_CHAR16(charWeights2,charWeightsInt42);\n"
+" UCHAR8_TO_CHAR16(charWeights3,charWeightsInt43);\n"
+" weights0=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n"
+" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s2+ScaleOffset.s3;\n"
+" weights2=CONVERT_COMPUTE_FLOAT16(charWeights2)*ScaleOffset.s4+ScaleOffset.s5;\n"
+" weights3=CONVERT_COMPUTE_FLOAT16(charWeights3)*ScaleOffset.s6+ScaleOffset.s7;\n"
+" }\n"
+" #endif\n"
 " COMPUTE_FLOAT *weights0_ptr=(COMPUTE_FLOAT *)&weights0;\n"
 " COMPUTE_FLOAT *weights1_ptr=(COMPUTE_FLOAT *)&weights1;\n"
+" COMPUTE_FLOAT *weights2_ptr=(COMPUTE_FLOAT *)&weights2;\n"
+" COMPUTE_FLOAT *weights3_ptr=(COMPUTE_FLOAT *)&weights3;\n"
 " #pragma unroll\n"
 " for (int i=0; i<remain; ++i){\n"
 " COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k16+i)*4));\n"
 " out=mad(in,weights0_ptr[i],out);\n"
 " out1=mad(in,weights1_ptr[i],out1);\n"
+" out2=mad(in,weights2_ptr[i],out2);\n"
+" out3=mad(in,weights3_ptr[i],out3);\n"
 " }\n"
 " }\n"
 "#endif\n"
-"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
+" }\n"
+"#ifdef RELU\n"
+" out=fmax(out,(COMPUTE_FLOAT4)0);\n"
+" out1=fmax(out1,(COMPUTE_FLOAT4)0);\n"
+" out2=fmax(out2,(COMPUTE_FLOAT4)0);\n"
+" out3=fmax(out3,(COMPUTE_FLOAT4)0);\n"
+"#endif\n"
+"#ifdef RELU6\n"
+" out=clamp(out,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
+" out1=clamp(out1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
+" out2=clamp(out2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
+" out3=clamp(out3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n"
+"#endif\n"
+" vstore4(CONVERT_FLOAT4(out),0,output+out_offset);\n"
+" vstore4(CONVERT_FLOAT4(out1),0,output+out_offset+4);\n"
+" vstore4(CONVERT_FLOAT4(out2),0,output+out_offset+8);\n"
+" vstore4(CONVERT_FLOAT4(out3),0,output+out_offset+12);\n"
+"}\n"
+"__kernel void gemm_b4_c2_image(GLOBAL_SIZE_DIM2\n"
+" __global const FLOAT* input,\n"
+" __read_only image2d_t weight,\n"
+" __global const float *dequantScaleOffset,\n"
+" __global const FLOAT *bias,\n"
+" __global FLOAT* output,\n"
+" __private const int dstChannelC4,\n"
+" __private const int srcChannelC4,\n"
+" __private const int blockNum,\n"
+" __private const int blockDim) {\n"
+" const int x=get_global_id(0); //c\n"
+" const int y=get_global_id(1); //b\n"
+" UNIFORM_BOUNDRY_CHECK(x,y);\n"
+" const int out_c_idx=x << 1;\n"
+" const int out_b_idx=y << 2;\n"
+" \n"
+" COMPUTE_FLOAT2 bias0=CONVERT_COMPUTE_FLOAT2(vload2(0,bias+out_c_idx));\n"
+" COMPUTE_FLOAT4 out=(COMPUTE_FLOAT4)bias0.s0;\n"
+" COMPUTE_FLOAT4 out1=(COMPUTE_FLOAT4)bias0.s1;\n"
+" int input_offset=out_b_idx*srcChannelC4*4;\n"
+" int out_offset=(out_b_idx*dstChannelC4+out_c_idx)*4;\n"
+" \n"
+" const int loop=(blockDim+15)/16;\n"
+" #ifdef INPUT_CHANNEL_LEAVE\n"
+" const int loop_end=max(loop-1,0);\n"
+" const int remain=blockDim-loop_end*16;\n"
+" #else\n"
+" const int loop_end=loop;\n"
+" #endif\n"
+" \n"
+" for (int i=0; i<blockNum; i++){\n"
+" int kindex=i*dstChannelC4*4*2;\n"
+" COMPUTE_FLOAT4 ScaleOffset=CONVERT_COMPUTE_FLOAT4(vload4(0,dequantScaleOffset+out_c_idx*2+kindex));\n"
 " for (int j=0; j<loop_end; j++) {\n"
 " int k=i*loop+j;\n"
-" int k32=k << 5;\n"
-" COMPUTE_FLOAT16 weights0,weights1,weights2,weights3;\n"
+" int k16=k << 4;\n"
+" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
+" COMPUTE_FLOAT16 weights0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k))))*ScaleOffset.s0+ScaleOffset.s1;\n"
+" COMPUTE_FLOAT16 weights1=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+1,k))))*ScaleOffset.s2+ScaleOffset.s3;\n"
+" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
+" COMPUTE_FLOAT16 weights0,weights1;\n"
 " {\n"
-" uchar16 charWeightsInt4=as_uchar16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k)));\n"
+" uchar8 charWeightsInt40=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx,k))));\n"
+" uchar8 charWeightsInt41=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx+1,k))));\n"
 " char16 charWeights0=0;\n"
 " char16 charWeights1=0;\n"
-" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt4);\n"
+" UCHAR8_TO_CHAR16(charWeights0,charWeightsInt40);\n"
+" UCHAR8_TO_CHAR16(charWeights1,charWeightsInt41);\n"
 " weights0=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" }\n"
-" {\n"
-" uchar16 charWeightsInt4=as_uchar16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+1,k)));\n"
-" char16 charWeights0=0;\n"
-" char16 charWeights1=0;\n"
-" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt4);\n"
-" weights2=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s2+ScaleOffset.s3;\n"
-" weights3=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s2+ScaleOffset.s3;\n"
+" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s2+ScaleOffset.s3;\n"
 " }\n"
+" #endif\n"
 " COMPUTE_FLOAT *weights0_ptr=(COMPUTE_FLOAT *)&weights0;\n"
 " COMPUTE_FLOAT *weights1_ptr=(COMPUTE_FLOAT *)&weights1;\n"
-" COMPUTE_FLOAT *weights2_ptr=(COMPUTE_FLOAT *)&weights2;\n"
-" COMPUTE_FLOAT *weights3_ptr=(COMPUTE_FLOAT *)&weights3;\n"
 " #pragma unroll\n"
 " for (int i=0; i<16; ++i){\n"
-" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k32+i)*4));\n"
+" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k16+i)*4));\n"
 " out=mad(in,weights0_ptr[i],out);\n"
-" out1=mad(in,weights2_ptr[i],out1);\n"
-" }\n"
-" #pragma unroll\n"
-" for (int i=0; i<16; ++i){\n"
-" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k32+i+16)*4));\n"
-" out=mad(in,weights1_ptr[i],out);\n"
-" out1=mad(in,weights3_ptr[i],out1);\n"
+" out1=mad(in,weights1_ptr[i],out1);\n"
 " }\n"
 " }\n"
 "#ifdef INPUT_CHANNEL_LEAVE\n"
 " {\n"
 " int k=i*loop+loop_end;\n"
-" int k32=k << 5;\n"
-" COMPUTE_FLOAT16 weights0,weights1,weights2,weights3;\n"
+" int k16=k << 4;\n"
+" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
+" COMPUTE_FLOAT16 weights0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k))))*ScaleOffset.s0+ScaleOffset.s1;\n"
+" COMPUTE_FLOAT16 weights1=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+1,k))))*ScaleOffset.s2+ScaleOffset.s3;\n"
+" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
+" COMPUTE_FLOAT16 weights0,weights1;\n"
 " {\n"
-" uchar16 charWeightsInt4=as_uchar16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k)));\n"
+" uchar8 charWeightsInt40=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx,k))));\n"
+" uchar8 charWeightsInt41=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx+1,k))));\n"
 " char16 charWeights0=0;\n"
 " char16 charWeights1=0;\n"
-" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt4);\n"
+" UCHAR8_TO_CHAR16(charWeights0,charWeightsInt40);\n"
+" UCHAR8_TO_CHAR16(charWeights1,charWeightsInt41);\n"
 " weights0=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" }\n"
-" {\n"
-" uchar16 charWeightsInt4=as_uchar16(read_imagei(weight,SAMPLER,(int2)(out_c_idx+1,k)));\n"
-" char16 charWeights0=0;\n"
-" char16 charWeights1=0;\n"
-" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt4);\n"
-" weights2=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s2+ScaleOffset.s3;\n"
-" weights3=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s2+ScaleOffset.s3;\n"
+" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s2+ScaleOffset.s3;\n"
 " }\n"
+" #endif\n"
 " COMPUTE_FLOAT *weights0_ptr=(COMPUTE_FLOAT *)&weights0;\n"
 " COMPUTE_FLOAT *weights1_ptr=(COMPUTE_FLOAT *)&weights1;\n"
-" COMPUTE_FLOAT *weights2_ptr=(COMPUTE_FLOAT *)&weights2;\n"
-" COMPUTE_FLOAT *weights3_ptr=(COMPUTE_FLOAT *)&weights3;\n"
-" for (int i=0; i<min(16,remain); ++i){\n"
-" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k32+i)*4));\n"
+" #pragma unroll\n"
+" for (int i=0; i<remain; ++i){\n"
+" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k16+i)*4));\n"
 " out=mad(in,weights0_ptr[i],out);\n"
-" out1=mad(in,weights2_ptr[i],out1);\n"
-" }\n"
-" for (int i=16; i<remain; ++i){\n"
-" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k32+i)*4));\n"
-" out=mad(in,weights1_ptr[i-16],out);\n"
-" out1=mad(in,weights3_ptr[i-16],out1);\n"
+" out1=mad(in,weights1_ptr[i],out1);\n"
 " }\n"
 " }\n"
 "#endif\n"
-"#endif //USE_LOW_BIT_WEIGHT_INT4\n"
 " }\n"
 "#ifdef RELU\n"
 " out=fmax(out,(COMPUTE_FLOAT4)0);\n"
@@ -7611,7 +7346,6 @@ const char* gemm_quant_batch_buf =
 " int input_offset=out_b_idx*srcChannelC4*4;\n"
 " int out_offset=(out_b_idx*dstChannelC4+out_c_idx)*4;\n"
 " \n"
-"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " const int loop=(blockDim+15)/16;\n"
 " #ifdef INPUT_CHANNEL_LEAVE\n"
 " const int loop_end=max(loop-1,0);\n"
@@ -7619,23 +7353,23 @@ const char* gemm_quant_batch_buf =
 " #else\n"
 " const int loop_end=loop;\n"
 " #endif\n"
-"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
-" const int loop=(blockDim+31)/32;\n"
-" #ifdef INPUT_CHANNEL_LEAVE\n"
-" const int loop_end=max(loop-1,0);\n"
-" const int remain=blockDim-loop_end*32;\n"
-" #else\n"
-" const int loop_end=loop;\n"
-" #endif\n"
-"#endif\n"
 " for (int i=0; i<blockNum; ++i){\n"
 " int kindex=i*dstChannelC4*4*2;\n"
 " COMPUTE_FLOAT2 ScaleOffset=CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx,dequantScaleOffset+kindex));\n"
-"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " for (int j=0; j<loop_end; j++) {\n"
 " int k=i*loop+j;\n"
 " int k16=k << 4;\n"
+" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " COMPUTE_FLOAT16 weights0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k))))*ScaleOffset.s0+ScaleOffset.s1;\n"
+" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
+" COMPUTE_FLOAT16 weights0;\n"
+" {\n"
+" uchar8 charWeightsInt4=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx,k))));\n"
+" char16 charWeights=0;\n"
+" UCHAR8_TO_CHAR16(charWeights,charWeightsInt4);\n"
+" weights0=CONVERT_COMPUTE_FLOAT16(charWeights)*ScaleOffset.s0+ScaleOffset.s1;\n"
+" }\n"
+" #endif\n"
 " COMPUTE_FLOAT *weights0_ptr=(COMPUTE_FLOAT *)&weights0;\n"
 " #pragma unroll\n"
 " for (int i=0; i<16; ++i){\n"
@@ -7647,67 +7381,25 @@ const char* gemm_quant_batch_buf =
 " {\n"
 " int k=i*loop+loop_end;\n"
 " int k16=k << 4;\n"
+" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n"
 " COMPUTE_FLOAT16 weights0=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k))))*ScaleOffset.s0+ScaleOffset.s1;\n"
-" COMPUTE_FLOAT *weights0_ptr=(COMPUTE_FLOAT *)&weights0;\n"
-" #pragma unroll\n"
-" for (int i=0; i<remain; ++i){\n"
-" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k16+i)*4));\n"
-" out=mad(in,weights0_ptr[i],out);\n"
-" }\n"
-" }\n"
-"#endif\n"
-"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
-" for (int j=0; j<loop_end; j++) {\n"
-" int k=i*loop+j;\n"
-" int k32=k << 5;\n"
-" COMPUTE_FLOAT16 weights0,weights1;\n"
+" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n"
+" COMPUTE_FLOAT16 weights0;\n"
 " {\n"
-" uchar16 charWeightsInt4=as_uchar16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k)));\n"
-" char16 charWeights0=0;\n"
-" char16 charWeights1=0;\n"
-" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt4);\n"
-" weights0=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s0+ScaleOffset.s1;\n"
+" uchar8 charWeightsInt4=as_uchar8(convert_ushort4(read_imageui(weight,SAMPLER,(int2)(out_c_idx,k))));\n"
+" char16 charWeights=0;\n"
+" UCHAR8_TO_CHAR16(charWeights,charWeightsInt4);\n"
+" weights0=CONVERT_COMPUTE_FLOAT16(charWeights)*ScaleOffset.s0+ScaleOffset.s1;\n"
 " }\n"
+" #endif\n"
 " COMPUTE_FLOAT *weights0_ptr=(COMPUTE_FLOAT *)&weights0;\n"
-" COMPUTE_FLOAT *weights1_ptr=(COMPUTE_FLOAT *)&weights1;\n"
-" #pragma unroll\n"
-" for (int i=0; i<16; ++i){\n"
-" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k32+i)*4));\n"
-" out=mad(in,weights0_ptr[i],out);\n"
-" }\n"
 " #pragma unroll\n"
-" for (int i=0; i<16; ++i){\n"
-" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k32+i+16)*4));\n"
-" out=mad(in,weights1_ptr[i],out);\n"
-" }\n"
-" }\n"
-"#ifdef INPUT_CHANNEL_LEAVE\n"
-" {\n"
-" int k=i*loop+loop_end;\n"
-" int k32=k << 5;\n"
-" COMPUTE_FLOAT16 weights0,weights1;\n"
-" {\n"
-" uchar16 charWeightsInt4=as_uchar16(read_imagei(weight,SAMPLER,(int2)(out_c_idx,k)));\n"
-" char16 charWeights0=0;\n"
-" char16 charWeights1=0;\n"
-" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt4);\n"
-" weights0=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" weights1=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s0+ScaleOffset.s1;\n"
-" }\n"
-" COMPUTE_FLOAT *weights0_ptr=(COMPUTE_FLOAT *)&weights0;\n"
-" COMPUTE_FLOAT *weights1_ptr=(COMPUTE_FLOAT *)&weights1;\n"
-" for (int i=0; i<min(16,remain); ++i){\n"
-" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k32+i)*4));\n"
+" for (int i=0; i<remain; ++i){\n"
+" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k16+i)*4));\n"
 " out=mad(in,weights0_ptr[i],out);\n"
 " }\n"
-" for (int i=16; i<remain; ++i){\n"
-" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+(k32+i)*4));\n"
-" out=mad(in,weights1_ptr[i-16],out);\n"
-" }\n"
 " }\n"
 "#endif\n"
-"#endif //USE_LOW_BIT_WEIGHT_INT4\n"
 " }\n"
 "#ifdef RELU\n"
 " out=fmax(out,(COMPUTE_FLOAT4)0);\n"
@@ -11684,18 +11376,14 @@ const char* buffer_convert_quant =
 " __global char *output) {\n"
 " int image_width_idx=get_global_id(0); // ic\n"
 " int image_height_idx=get_global_id(1); // oc/4 h w\n"
-" \n"
 " DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
-" \n"
 " const int input_channel_4_idx=image_width_idx;\n"
 " const int output_channel_4_idx=(image_height_idx/height_width_size)*4;\n"
 " const int height_width_idx=image_height_idx % height_width_size;\n"
 " const int buffer_height_idx=height_width_idx/kernel_shape.y;\n"
 " const int buffer_width_idx=height_width_idx % kernel_shape.y;\n"
-" \n"
 " const int buffer_offset=output_channel_4_idx*ic_h_w_size+input_channel_4_idx*height_width_size +\n"
 " buffer_height_idx*kernel_shape.y+buffer_width_idx;\n"
-" \n"
 " char4 output_values=0;\n"
 " if (output_channel_4_idx<output_channel) {\n"
 " const int remain_channel=output_channel-output_channel_4_idx;\n"
@@ -11715,7 +11403,6 @@ const char* buffer_convert_quant =
 " output_values.y=(char)(*(input_ptr+offset));\n"
 " offset += ic_h_w_size;\n"
 " output_values.z=(char)(*(input_ptr+offset));\n"
-" \n"
 " } else if (remain_channel == 2) {\n"
 " int offset=buffer_offset;\n"
 " output_values.x=(char)(*(input_ptr+offset));\n"
@@ -11733,7 +11420,7 @@ const char* buffer_convert_quant =
 "#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
 "// convert kernel : from int8 buffer(oihw) to int4 image(oc/4 h w ,ic oc4)\n"
 "__kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int4(GLOBAL_SIZE_2_DIMS\n"
-" __global const char *input_ptr,\n"
+" __global const uchar *input_ptr,\n"
 " __private const int output_channel,\n"
 " __private const int2 kernel_shape,\n"
 " __private const int ic_h_w_size,\n"
@@ -11741,53 +11428,21 @@ const char* buffer_convert_quant =
 " __global uchar *output) {\n"
 " int image_width_idx=get_global_id(0); // ic\n"
 " int image_height_idx=get_global_id(1); // oc/4 h w\n"
-" \n"
 " DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n"
-" \n"
 " const int input_channel_4_idx=image_width_idx;\n"
 " const int output_channel_4_idx=(image_height_idx/height_width_size)*4;\n"
 " const int height_width_idx=image_height_idx % height_width_size;\n"
 " const int buffer_height_idx=height_width_idx/kernel_shape.y;\n"
 " const int buffer_width_idx=height_width_idx % kernel_shape.y;\n"
-" \n"
-" const int buffer_offset=output_channel_4_idx*ic_h_w_size+input_channel_4_idx*height_width_size +\n"
-" buffer_height_idx*kernel_shape.y+buffer_width_idx;\n"
-" \n"
-" char4 output_values_int8=0;\n"
-" if (output_channel_4_idx<output_channel) {\n"
-" const int remain_channel=output_channel-output_channel_4_idx;\n"
-" if (remain_channel >= 4) {\n"
-" int offset=buffer_offset;\n"
-" output_values_int8.x=(char)(*(input_ptr+offset));\n"
-" offset=mad24(1,ic_h_w_size,offset);\n"
-" output_values_int8.y=(char)(*(input_ptr+offset));\n"
-" offset += ic_h_w_size;\n"
-" output_values_int8.z=(char)(*(input_ptr+offset));\n"
-" offset += ic_h_w_size;\n"
-" output_values_int8.w=(char)(*(input_ptr+offset));\n"
-" } else if (remain_channel == 3) {\n"
-" int offset=buffer_offset;\n"
-" output_values_int8.x=(char)(*(input_ptr+offset));\n"
-" offset=mad24(1,ic_h_w_size,offset);\n"
-" output_values_int8.y=(char)(*(input_ptr+offset));\n"
-" offset += ic_h_w_size;\n"
-" output_values_int8.z=(char)(*(input_ptr+offset));\n"
-" \n"
-" } else if (remain_channel == 2) {\n"
-" int offset=buffer_offset;\n"
-" output_values_int8.x=(char)(*(input_ptr+offset));\n"
-" offset=mad24(1,ic_h_w_size,offset);\n"
-" output_values_int8.y=(char)(*(input_ptr+offset));\n"
-" } else if (remain_channel == 1) {\n"
-" int offset=buffer_offset;\n"
-" output_values_int8.x=(char)(*(input_ptr+offset));\n"
-" }\n"
-" }\n"
-" \n"
+" const int buffer_offset=output_channel_4_idx*ic_h_w_size+input_channel_4_idx*height_width_size+buffer_height_idx*kernel_shape.y+buffer_width_idx;\n"
+" int index0=buffer_offset,index1=buffer_offset+ic_h_w_size,index2=buffer_offset+2*ic_h_w_size,index3=buffer_offset+3*ic_h_w_size;\n"
 " uchar2 output_values_int4=(uchar2)(0,0);\n"
-" output_values_int4.s0=(output_values_int8.x+8)*16+(output_values_int8.y+8);\n"
-" output_values_int4.s1=(output_values_int8.z+8)*16+(output_values_int8.w+8);\n"
-" \n"
+" uchar s0=input_ptr[index0/2];\n"
+" uchar s1=output_channel_4_idx+1 >= output_channel ? 0 : input_ptr[index1/2];\n"
+" uchar s2=output_channel_4_idx+1 >= output_channel ? 0 : input_ptr[index2/2];\n"
+" uchar s3=output_channel_4_idx+1 >= output_channel ? 0 : input_ptr[index3/2];\n"
+" output_values_int4.x=((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f));\n"
+" output_values_int4.y=((index2 % 2) == 0 ? (s2 & 0xf0) : (s2 << 4)) | ((index3 % 2) == 0 ? (s3 >> 4) : (s3 & 0x0f));\n"
 " const int out_offset=(image_width_idx*height_width_size*((output_channel+3)/4)+image_height_idx)*2;\n"
 " vstore2(output_values_int4,0,output+out_offset);\n"
 "}\n"
@@ -11795,106 +11450,127 @@ const char* buffer_convert_quant =
 "#define CHAR16_TO_UCHAR8(a, b) "" a=(uchar8)(((b.s0+8) << 4)+b.s1+8,((b.s2+8) << 4)+b.s3+8,((b.s4+8) << 4)+b.s5+8,((b.s6+8) << 4)+b.s7+8,((b.s8+8) << 4)+b.s9+8,((b.sa+8) << 4)+b.sb+8,((b.sc+8) << 4)+b.sd+8,((b.se+8) << 4)+b.sf+8);\n"
 "#define CHAR32_TO_UCHAR16(a, b, c) "" a = (uchar16)(((b.s0 + 8) << 4) + b.s1 + 8, ((b.s2 + 8) << 4) + b.s3 + 8, ((b.s4 + 8) << 4) + b.s5 + 8, ((b.s6 + 8) << 4) + b.s7 + 8, ((b.s8 + 8) << 4) + b.s9 + 8, ((b.sa + 8) << 4) + b.sb + 8, ((b.sc + 8) << 4) + b.sd + 8, ((b.se + 8) << 4) + b.sf + 8, "" ((c.s0+8) << 4)+c.s1+8,((c.s2+8) << 4)+c.s3+8,((c.s4+8) << 4)+c.s5+8,((c.s6+8) << 4)+c.s7+8,((c.s8+8) << 4)+c.s9+8,((c.sa+8) << 4)+c.sb+8,((c.sc+8) << 4)+c.sd+8,((c.se+8) << 4)+c.sf+8);\n"
 "__kernel void conv2d_1x1_weight_quant_buffer(GLOBAL_SIZE_2_DIMS\n"
-" __global const char *input_ptr,\n"
 "#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
-" __global uchar *output_ptr,\n"
+" __global const uchar *input_ptr,\n"
 "#else\n"
-" __global char *output_ptr,\n"
+" __global const char *input_ptr,\n"
 "#endif\n"
+" __global char *output_ptr,\n"
 " __private const int input_channel,\n"
 " __private const int output_channel) {\n"
 " int x=get_global_id(0); // ic/16\n"
 " int y=get_global_id(1); // oc\n"
-" \n"
 " DEAL_NON_UNIFORM_DIM2(x,y);\n"
 " const int xin=x << 4;\n"
 " const int outputChannelC4=(output_channel+3) >> 2;\n"
-" const int inputOffset=y*input_channel+xin;\n"
-" char16 weight=0;\n"
-"#ifdef INPUT_CHANNEL_LEAVE\n"
-" if(xin+15 >= input_channel){\n"
-" char *weight_ptr=(char*)&weight;\n"
-" for(int i=0,j=0; xin+i<input_channel && j<16; ++i,++j){\n"
-" weight_ptr[j]=input_ptr[inputOffset+i];\n"
-" }\n"
-" }else {\n"
-" weight=vload16(0,input_ptr+inputOffset);\n"
+"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
+" const int outputOffset=((x*outputChannelC4*4*8+y*8));\n"
+"#ifdef CHANNEL_LEAVE\n"
+" for(int i=0; i<8; ++i){\n"
+" int index0=y*input_channel+xin+i*2;\n"
+" int index1=y*input_channel+xin+i*2+1;\n"
+" uchar s0=input_ptr[index0/2];\n"
+" uchar s1=input_ptr[index1/2];\n"
+" output_ptr[outputOffset+i]=((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f));\n"
 " }\n"
 "#else\n"
-" weight=vload16(0,input_ptr+inputOffset);\n"
+" const int inputOffset=(y*input_channel+xin)/2;\n"
+" vstore8(convert_char8(vload8(0,input_ptr+inputOffset)),0,output_ptr+outputOffset);\n"
 "#endif\n"
-" \n"
-"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
-" const int outputOffset=((x*outputChannelC4*4*8+y*8));\n"
-" uchar8 outWeight;\n"
-" CHAR16_TO_UCHAR8(outWeight,weight);\n"
-" vstore8(outWeight,0,output_ptr+outputOffset);\n"
 "#else\n"
+" const int inputOffset=y*input_channel+xin;\n"
 " const int outputOffset=(x*outputChannelC4*4+y) << 4;\n"
-" vstore16(weight,0,output_ptr+outputOffset);\n"
+" vstore16(convert_char16(vload16(0,input_ptr+inputOffset)),0,output_ptr+outputOffset);\n"
 "#endif\n"
 "}\n"
 "__kernel void conv2d_1x1_weight_quant_image(GLOBAL_SIZE_2_DIMS\n"
-" __global const char *input_ptr,\n"
+"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
+" __global const uchar *input_ptr,\n"
+"#else\n"
+" __global const uchar *input_ptr,\n"
+"#endif\n"
 " __write_only image2d_t output,\n"
 " __private const int input_channel,\n"
 " __private const int output_channel) {\n"
-" \n"
-"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
-" int x=get_global_id(0); // ic/32\n"
+" int x=get_global_id(0); // ic/16\n"
 " int y=get_global_id(1); // oc\n"
-" \n"
 " DEAL_NON_UNIFORM_DIM2(x,y);\n"
-" const int outputChannelC4=(output_channel+3) >> 2;\n"
-" const int xin=x << 5;\n"
-" const int inputOffset=y*input_channel+xin;\n"
-" char16 weight00=0,weight01=0;\n"
-"#ifdef INPUT_CHANNEL_LEAVE\n"
-" if(xin+31 >= input_channel){\n"
-" char *weight00_ptr=(char*)&weight00;\n"
-" char *weight01_ptr=(char*)&weight01;\n"
-" int i=0;\n"
-" for(int j=0; xin+i<input_channel && j<16; ++i,++j){\n"
-" weight00_ptr[j]=input_ptr[inputOffset+i];\n"
-" }\n"
-" for(int j=0; xin+i<input_channel && j<16; ++i,++j){\n"
-" weight01_ptr[j]=input_ptr[inputOffset+i];\n"
-" }\n"
-" }else {\n"
-" weight00=vload16(0,input_ptr+inputOffset);\n"
-" weight01=vload16(0,input_ptr+inputOffset+16);\n"
-" }\n"
+" const int xin=x << 4;\n"
+"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
+"#ifdef CHANNEL_LEAVE\n"
+" uchar8 out=0;\n"
+" uchar *out_ptr=(uchar*)&out;\n"
+" for(int i=0; i<8; ++i){\n"
+" int index0=y*input_channel+xin+i*2;\n"
+" int index1=y*input_channel+xin+i*2+1;\n"
+" uchar s0=input_ptr[index0/2];\n"
+" uchar s1=input_ptr[index1/2];\n"
+" out_ptr[i]=((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f));\n"
+" }\n"
+" write_imageui(output,(int2)(y,x),convert_uint4(as_ushort4(out)));\n"
 "#else\n"
-" weight00=vload16(0,input_ptr+inputOffset);\n"
-" weight01=vload16(0,input_ptr+inputOffset+16);\n"
+" const int inputOffset=(y*input_channel+xin)/2;\n"
+" write_imageui(output,(int2)(y,x),convert_uint4(as_ushort4(vload8(0,input_ptr+inputOffset))));\n"
 "#endif\n"
-" \n"
-" uchar16 outWeight;\n"
-" CHAR32_TO_UCHAR16(outWeight,weight00,weight01);\n"
-" write_imagei(output,(int2)(y,x),as_int4(outWeight));\n"
 "#else\n"
-" int x=get_global_id(0); // ic/16\n"
-" int y=get_global_id(1); // oc\n"
-" \n"
-" DEAL_NON_UNIFORM_DIM2(x,y);\n"
-" const int xin=x << 4;\n"
 " const int inputOffset=y*input_channel+xin;\n"
-" const int outputChannelC4=(output_channel+3) >> 2;\n"
-" char16 weight=0;\n"
-"#ifdef INPUT_CHANNEL_LEAVE\n"
-" if(xin+15 >= input_channel){\n"
-" char *weight_ptr=(char*)&weight;\n"
-" for(int i=0,j=0; xin+i<input_channel && j<16; ++i,++j){\n"
-" weight_ptr[j]=input_ptr[inputOffset+i];\n"
+" write_imagei(output,(int2)(y,x),as_int4(vload16(0,input_ptr+inputOffset)));\n"
+"#endif\n"
+"}\n"
+"__kernel void conv2d_1x1_ic_oc_weight_quant_buffer(GLOBAL_SIZE_2_DIMS\n"
+"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
+" __global const uchar *input_ptr,\n"
+" __global uchar *output_ptr,//(Ci/packCin， Co/packCout,packCin， packCout)\n"
+"#else\n"
+" __global const char *input_ptr,\n"
+" __global char *output_ptr,//(Ci/packCin， Co/packCout,packCin， packCout)\n"
+"#endif\n"
+" __private const int input_channel,\n"
+" __private const int output_channel,\n"
+" __private const int icPack,\n"
+" __private const int ocPack) {\n"
+" int x=get_global_id(0); // ic/icPack\n"
+" int y=get_global_id(1); // oc/ocPack\n"
+" DEAL_NON_UNIFORM_DIM2(x,y);\n"
+" const int xin=x*icPack;\n"
+" const int yin=y*ocPack;\n"
+" const int inputChannelC4=(input_channel+icPack-1)/icPack;\n"
+" const int outputChannelC4=(output_channel+ocPack-1)/ocPack;\n"
+"#ifdef USE_LOW_BIT_WEIGHT_INT4\n"
+" const int inputOffset=(yin*input_channel+xin)/2;\n"
+" const int outputOffset=((x*outputChannelC4+y)*icPack*ocPack)/2;\n"
+"#ifdef CHANNEL_LEAVE\n"
+" for(int i=0; i<icPack; ++i){\n"
+" for(int j=0; j<ocPack/2; ++j){\n"
+" int index0=(yin+j*2)*input_channel+xin+i;\n"
+" int index1=(yin+j*2+1)*input_channel+xin+i;\n"
+" uchar s0=input_ptr[index0/2];\n"
+" uchar s1=input_ptr[index1/2];\n"
+" s0=(index0 % 2) == 0 ? (s0 & 0xf0) : ((s0 & 0x0f) << 4);\n"
+" s1=(index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f);\n"
+" output_ptr[outputOffset+i*(ocPack/2)+j]=s0 | s1;\n"
 " }\n"
-" }else {\n"
-" weight=vload16(0,input_ptr+inputOffset);\n"
 " }\n"
 "#else\n"
-" weight=vload16(0,input_ptr+inputOffset);\n"
+" for(int i=0; i<icPack/2; ++i){\n"
+" for(int j=0; j<ocPack/2; ++j){\n"
+" char s0=input_ptr[inputOffset+(j*2)*(input_channel/2)+i];\n"
+" char s1=input_ptr[inputOffset+(j*2+1)*(input_channel/2)+i];\n"
+" char d0=(s0 & 0xf0) | ((s1 & 0xf0) >> 4);\n"
+" char d1=((s0 & 0x0f) << 4) | (s1 & 0x0f);\n"
+" output_ptr[outputOffset+(i*2)*(ocPack/2)+j]=d0;\n"
+" output_ptr[outputOffset+(i*2+1)*(ocPack/2)+j]=d1;\n"
+" }\n"
+" }\n"
 "#endif\n"
-" \n"
-" write_imagei(output,(int2)(y,x),as_int4(weight));\n"
+"#else\n"
+" const int inputOffset=yin*input_channel+xin;\n"
+" const int outputOffset=(x*outputChannelC4+y)*icPack*ocPack;\n"
+" for(int i=0; i<icPack; ++i){\n"
+" for(int j=0; j<ocPack; ++j){\n"
+" output_ptr[outputOffset+i*ocPack+j]=input_ptr[inputOffset+j*input_channel+i];\n"
+" }\n"
+" }\n"
 "#endif\n"
 "}\n"
 ;
@@ -13623,8 +13299,8 @@ const char* groupnorm_buf =
 " sum[lid]=sum[lid]+sum[lid+i];\n"
 " barrier(CLK_LOCAL_MEM_FENCE);\n"
 " }\n"
-" float4 square_sum=sum[0]/(float4)inside;\n"
-" float4 value=(float4)1.0f/(float4)sqrt(square_sum+(float4)epsilon);\n"
+" float4 square_sum=(float4)(sum[0]/inside);\n"
+" float4 value=(float4)(1.0f/sqrt(square_sum.x+epsilon));\n"
 " for(int i=lid; i<inside_v4; i+=LOCAL_SIZE){\n"
 " float4 in0=convert_float4(vload4(i,input0+offset));\n"
 " float in1=input1[idx_out*(inside/area)+i/(area/4)];\n"
@@ -13655,7 +13331,6 @@ const char* groupnorm_buf =
 " sum[lid]=sum[lid]+sum[lid+i];\n"
 " barrier(CLK_LOCAL_MEM_FENCE);\n"
 " }\n"
-" \n"
 " float mean=sum[0]/inside;\n"
 " in_sum=0;\n"
 " index=lid;\n"
@@ -13721,7 +13396,7 @@ const char* groupnorm_buf =
 " barrier(CLK_LOCAL_MEM_FENCE);\n"
 " }\n"
 " \n"
-" float4 mean=sum[0]/(float4)inside;\n"
+" float4 mean=(float4)(sum[0]/inside);\n"
 " in_sum=0;\n"
 " index=lid;\n"
 " for(; index<inside_v4-1; index+=LOCAL_SIZE){\n"
@@ -13750,8 +13425,8 @@ const char* groupnorm_buf =
 " sum[lid]=sum[lid]+sum[lid+i];\n"
 " barrier(CLK_LOCAL_MEM_FENCE);\n"
 " }\n"
-" float4 square_sum=sum[0]/(float4)inside;\n"
-" float4 value=(float4)1.0f/(float4)sqrt(square_sum+(float4)epsilon);\n"
+" float4 square_sum=(float4)(sum[0]/inside);\n"
+" float4 value=(float4)(1.0f/sqrt(square_sum.x+epsilon));\n"
 " // The product of W and H is a multiple of 4\n"
 " #ifdef WH_4\n"
 " for(int i=lid; i<inside_v4; i+=LOCAL_SIZE){\n"
@@ -13777,6 +13452,7 @@ const char* groupnorm_buf =
 " #ifdef SWISH\n"
 " out=out*native_recip(1.0+native_exp(-out));\n"
 " #endif\n"
+" \n"
 " output[offset+i]=(FLOAT)out;\n"
 " }\n"
 " #endif\n"
@@ -19369,6 +19045,104 @@ const char* reduction_buf =
 ;
 #endif
 #ifndef MNN_OPENCL_BUFFER_CLOSED
+const char* strassen_binary_buf = 
+"#ifdef MNN_SUPPORT_FP16\n"
+"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
+"#endif\n"
+"__kernel void binary_cfunction_buf(__private int global_dim0,__private int global_dim1,\n"
+" __global FLOAT* input0,\n"
+" __private const int offsetC,\n"
+" __private const int strideC,\n"
+" __global FLOAT* input1,__global FLOAT* output,\n"
+" __private const int width,//[offsetA,offsetB,offsetC,0]\n"
+" __private const int height//[strideA,strideB,strideC,0]\n"
+") {\n"
+" int2 pos=(int2)(get_global_id(0),get_global_id(1));// [X/16,Y]\n"
+" \n"
+" if (pos.x<global_dim0 && pos.y<global_dim1) {\n"
+" int offset_11=offsetC+pos.x*8+pos.y*strideC;\n"
+" int offset_12=offset_11+width;\n"
+" int offset_21=offset_11+strideC*height;\n"
+" int offset_22=offset_21+width;\n"
+" FLOAT8 in_11=vload8(0,input0+offset_11);\n"
+" FLOAT8 in_12=vload8(0,input0+offset_12);\n"
+" FLOAT8 in_21=vload8(0,input0+offset_21);\n"
+" FLOAT8 in_22=vload8(0,input0+offset_22);\n"
+" FLOAT8 in_cx=vload8(0,input1+pos.x*8+pos.y*width);\n"
+" in_12=in_12+in_cx;\n"
+" in_21=in_12+in_21;\n"
+" in_12=in_22+in_12;\n"
+" in_22=in_22+in_21;\n"
+" in_12=in_11+in_12;\n"
+" vstore8(in_21,0,output+offset_21);\n"
+" vstore8(in_22,0,output+offset_22);\n"
+" vstore8(in_12,0,output+offset_12);\n"
+" }\n"
+"}\n"
+"#ifndef OPERATOR\n"
+"#define OPERATOR in0+in1\n"
+"#endif\n"
+"__kernel void binary_function_buf(__private int global_dim0,__private int global_dim1,\n"
+" __global FLOAT* input0,__global FLOAT* input1,__global FLOAT* output,\n"
+" __private const int4 baseOffsets,//[offsetA,offsetB,offsetC,0]\n"
+" __private const int4 strides//[strideA,strideB,strideC,0]\n"
+") {\n"
+" int2 pos=(int2)(get_global_id(0),get_global_id(1));// [X/16,Y]\n"
+" \n"
+" if (pos.x<global_dim0 && pos.y<global_dim1) {\n"
+" const int baseOffsetA=baseOffsets.x;\n"
+" const int baseOffsetB=baseOffsets.y;\n"
+" const int baseOffsetC=baseOffsets.z;\n"
+" const int strideA=strides.x;\n"
+" const int strideB=strides.y;\n"
+" const int strideC=strides.z;\n"
+" \n"
+" \n"
+" int offsetA=pos.x*8+pos.y*VEC_H*strideA+baseOffsetA;\n"
+" int offsetB=pos.x*8+pos.y*VEC_H*strideB+baseOffsetB;\n"
+" int offsetC=pos.x*8+pos.y*VEC_H*strideC+baseOffsetC;\n"
+" {\n"
+" FLOAT8 in0=vload8(0,input0+offsetA);\n"
+" FLOAT8 in1=vload8(0,input1+offsetB);\n"
+" FLOAT8 out=OPERATOR;\n"
+" vstore8(out,0,output+offsetC);\n"
+" }\n"
+" #if VEC_H >= 2\n"
+" {\n"
+" offsetA += strideA;\n"
+" offsetB += strideB;\n"
+" offsetC += strideC;\n"
+" FLOAT8 in0=vload8(0,input0+offsetA);\n"
+" FLOAT8 in1=vload8(0,input1+offsetB);\n"
+" FLOAT8 out=OPERATOR;\n"
+" vstore8(out,0,output+offsetC);\n"
+" }\n"
+" #endif\n"
+" #if VEC_H == 4\n"
+" {\n"
+" offsetA += strideA;\n"
+" offsetB += strideB;\n"
+" offsetC += strideC;\n"
+" FLOAT8 in0=vload8(0,input0+offsetA);\n"
+" FLOAT8 in1=vload8(0,input1+offsetB);\n"
+" FLOAT8 out=OPERATOR;\n"
+" vstore8(out,0,output+offsetC);\n"
+" }\n"
+" {\n"
+" offsetA += strideA;\n"
+" offsetB += strideB;\n"
+" offsetC += strideC;\n"
+" FLOAT8 in0=vload8(0,input0+offsetA);\n"
+" FLOAT8 in1=vload8(0,input1+offsetB);\n"
+" FLOAT8 out=OPERATOR;\n"
+" vstore8(out,0,output+offsetC);\n"
+" }\n"
+" #endif\n"
+" }\n"
+"}\n"
+;
+#endif
+#ifndef MNN_OPENCL_BUFFER_CLOSED
 const char* matmul_params_buf = 
 "#ifdef MNN_SUPPORT_FP16\n"
 "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
@@ -19442,6 +19216,24 @@ const char* matmul_params_buf =
 "#ifndef USE_CL_MAD\n"
 " #define USE_CL_MAD 0\n"
 "#endif\n"
+"// BIAS_TYPE\n"
+"// 0 -> without bias\n"
+"// 1 -> with bias (add) [N]\n"
+"// 2 -> with bias (eltwise_add) [M,N]\n"
+"// 3 -> with bias (eltwise_sub) [M,N]\n"
+"// 4 -> with bias (eltwise_sub and get negative) [M,N]\n"
+"#ifndef BIAS_TYPE\n"
+" #define BIAS_TYPE 0\n"
+"#endif\n"
+"#if BIAS_TYPE == 1\n"
+"#define DEAL_BIAS(x,a) x=x+a\n"
+"#elif BIAS_TYPE == 2\n"
+"#define DEAL_BIAS(x,a) x=x+a\n"
+"#elif BIAS_TYPE == 3\n"
+"#define DEAL_BIAS(x,a) x=x-a\n"
+"#elif BIAS_TYPE == 4\n"
+"#define DEAL_BIAS(x,a) x=a-x\n"
+"#endif\n"
 "// By default the workgroup size requirement is enabled. For Qualcomm devices the workgroup size\n"
 "// requirement results in worse performance and is disabled (src/utilities/compile.cpp)\n"
 "#ifndef RELAX_WORKGROUP_SIZE\n"
@@ -19652,7 +19444,7 @@ const char* matmul_params_buf =
 " return idm;\n"
 "}\n"
 "INLINE_FUNC realM GlobalToPrivateOptA(const __global realM* restrict agm,const int base,const int _mi,\n"
-" const int kSizeM,const int idk) {\n"
+" const int astride/*kSizeM*/,const int idk) {\n"
 " // Computes the indices based on strided/non-strided access\n"
 " #if STRM == 0\n"
 " // [MWG/MWI,MWI/VWM,VWM]\n"
@@ -19663,7 +19455,7 @@ const char* matmul_params_buf =
 " #endif\n"
 " // Loads the data from global memory (not transposed) and stores into registers\n"
 " // [kSizeK,kSizeM/VWM,VWM]\n"
-" return agm[idk*(kSizeM/VWM)+idm];\n"
+" return agm[idk*(astride/VWM)+idm];\n"
 "}\n"
 "INLINE_FUNC realM GlobalToPrivateA(const __global realM* restrict agm,const int _mi,\n"
 " const int kSizeM,const int idk) {\n"
@@ -19697,7 +19489,7 @@ const char* matmul_params_buf =
 " return idn;\n"
 "}\n"
 "INLINE_FUNC realN GlobalToPrivateOptB(const __global realN* restrict bgm,const int base,const int _ni,\n"
-" const int kSizeN,const int idk) {\n"
+" const int bstride/*kSizeN*/,const int idk) {\n"
 " // Computes the indices based on strided/non-strided access\n"
 " #if STRN == 0\n"
 " int idn=base+_ni;\n"
@@ -19705,7 +19497,7 @@ const char* matmul_params_buf =
 " int idn=base+_ni*NDIMC;\n"
 " #endif\n"
 " // Loads the data from global memory (transposed) and stores into registers\n"
-" return bgm[idk*(kSizeN/VWN)+idn];\n"
+" return bgm[idk*(bstride/VWN)+idn];\n"
 "}\n"
 "INLINE_FUNC realN GlobalToPrivateB(const __global realN* restrict bgm,const int _ni,\n"
 " const int kSizeN,const int idk) {\n"
@@ -19990,11 +19782,15 @@ const char* matmul_params_buf =
 "// layout : [M,N]\n"
 "INLINE_FUNC void StoreResultsN(__global realN* cgn,realN c_value,\n"
 " const INT2 baseOffset,\n"
-" #ifdef BIAS\n"
+" #if BIAS_TYPE>0\n"
+" #if BIAS_TYPE>1\n"
+" __global realN* egm,\n"
+" #else\n"
 " realN* epm,\n"
 " #endif\n"
+" #endif\n"
 " const int _mi,const int _ni,\n"
-" const int kSizeN,const real alpha,const real beta) {\n"
+" const int cstride/*kSizeN*/,const int dstride/*kSizeN*/,const real alpha,const real beta) {\n"
 " #if STRM == 0\n"
 " int idm=_mi+baseOffset.index[0];\n"
 " #elif STRM == 1\n"
@@ -20005,7 +19801,8 @@ const char* matmul_params_buf =
 " #elif STRN == 1\n"
 " int idn=baseOffset.index[1]+_ni*NDIMC;\n"
 " #endif\n"
-" int index=idm*(kSizeN/VWN)+idn;\n"
+" int index=idm*(cstride/VWN)+idn;\n"
+" \n"
 " realN result=c_value;\n"
 " \n"
 " // The final multiplication with alpha (in case beta == 0)\n"
@@ -20093,10 +19890,17 @@ const char* matmul_params_buf =
 " #endif\n"
 " \n"
 " \n"
-"#ifdef BIAS\n"
+"#if BIAS_TYPE>0\n"
+" #if BIAS_TYPE == 1\n"
 " realN eval=epm[_ni];\n"
+" #else\n"
+" \n"
+" int index_bias=idm*(dstride/VWN)+idn;\n"
+" realN eval=egm[index_bias];\n"
+" #endif\n"
+" \n"
 " #if VWN == 1\n"
-" result += eval;\n"
+" DEAL_BIAS(result,eval);\n"
 " #ifdef RELU\n"
 " result=fmax(result,(FLOAT)0);\n"
 " #endif\n"
@@ -20104,8 +19908,8 @@ const char* matmul_params_buf =
 " result=clamp(result,(FLOAT)0,(FLOAT)6);\n"
 " #endif\n"
 " #elif VWN == 2\n"
-" result.x += eval.x;\n"
-" result.y += eval.y;\n"
+" DEAL_BIAS(result.x,eval.x);\n"
+" DEAL_BIAS(result.y,eval.y);\n"
 " #ifdef RELU\n"
 " result=fmax(result,(FLOAT2)0);\n"
 " #endif\n"
@@ -20113,10 +19917,10 @@ const char* matmul_params_buf =
 " result=clamp(result,(FLOAT2)0,(FLOAT2)6);\n"
 " #endif\n"
 " #elif VWN == 4\n"
-" result.x += eval.x;\n"
-" result.y += eval.y;\n"
-" result.z += eval.z;\n"
-" result.w += eval.w;\n"
+" DEAL_BIAS(result.x,eval.x);\n"
+" DEAL_BIAS(result.y,eval.y);\n"
+" DEAL_BIAS(result.z,eval.z);\n"
+" DEAL_BIAS(result.w,eval.w);\n"
 " #ifdef RELU\n"
 " result=fmax(result,(FLOAT4)0);\n"
 " #endif\n"
@@ -20124,14 +19928,14 @@ const char* matmul_params_buf =
 " result=clamp(result,(FLOAT4)0,(FLOAT4)6);\n"
 " #endif\n"
 " #elif VWN == 8\n"
-" result.s0 += eval.s0;\n"
-" result.s1 += eval.s1;\n"
-" result.s2 += eval.s2;\n"
-" result.s3 += eval.s3;\n"
-" result.s4 += eval.s4;\n"
-" result.s5 += eval.s5;\n"
-" result.s6 += eval.s6;\n"
-" result.s7 += eval.s7;\n"
+" DEAL_BIAS(result.s0,eval.s0);\n"
+" DEAL_BIAS(result.s1,eval.s1);\n"
+" DEAL_BIAS(result.s2,eval.s2);\n"
+" DEAL_BIAS(result.s3,eval.s3);\n"
+" DEAL_BIAS(result.s4,eval.s4);\n"
+" DEAL_BIAS(result.s5,eval.s5);\n"
+" DEAL_BIAS(result.s6,eval.s6);\n"
+" DEAL_BIAS(result.s7,eval.s7);\n"
 " #ifdef RELU\n"
 " result=fmax(result,(FLOAT8)0);\n"
 " #endif\n"
@@ -20139,22 +19943,22 @@ const char* matmul_params_buf =
 " result=clamp(result,(FLOAT8)0,(FLOAT8)6);\n"
 " #endif\n"
 " #elif VWN == 16\n"
-" result.s0 += eval.s0;\n"
-" result.s1 += eval.s1;\n"
-" result.s2 += eval.s2;\n"
-" result.s3 += eval.s3;\n"
-" result.s4 += eval.s4;\n"
-" result.s5 += eval.s5;\n"
-" result.s6 += eval.s6;\n"
-" result.s7 += eval.s7;\n"
-" result.s8 += eval.s8;\n"
-" result.s9 += eval.s9;\n"
-" result.sA += eval.sA;\n"
-" result.sB += eval.sB;\n"
-" result.sC += eval.sC;\n"
-" result.sD += eval.sD;\n"
-" result.sE += eval.sE;\n"
-" result.sF += eval.sF;\n"
+" DEAL_BIAS(result.s0,eval.s0);\n"
+" DEAL_BIAS(result.s1,eval.s1);\n"
+" DEAL_BIAS(result.s2,eval.s2);\n"
+" DEAL_BIAS(result.s3,eval.s3);\n"
+" DEAL_BIAS(result.s4,eval.s4);\n"
+" DEAL_BIAS(result.s5,eval.s5);\n"
+" DEAL_BIAS(result.s6,eval.s6);\n"
+" DEAL_BIAS(result.s7,eval.s7);\n"
+" DEAL_BIAS(result.s8,eval.s8);\n"
+" DEAL_BIAS(result.s9,eval.s9);\n"
+" DEAL_BIAS(result.sA,eval.sA);\n"
+" DEAL_BIAS(result.sB,eval.sB);\n"
+" DEAL_BIAS(result.sC,eval.sC);\n"
+" DEAL_BIAS(result.sD,eval.sD);\n"
+" DEAL_BIAS(result.sE,eval.sE);\n"
+" DEAL_BIAS(result.sF,eval.sF);\n"
 " #ifdef RELU\n"
 " result=fmax(result,(FLOAT16)0);\n"
 " #endif\n"
@@ -20166,10 +19970,10 @@ const char* matmul_params_buf =
 " cgn[index]=result;\n"
 "}\n"
 "// Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.\n"
-"INLINE_FUNC void XgemmBody(const int kSizeM,const int kSizeN,const int kSizeK,\n"
+"INLINE_FUNC void XgemmBody(const int kSizeM,const int kSizeN,const int kSizeK,const int4 stride,\n"
 " const __global realM* restrict agm,const __global realN* restrict bgm,\n"
-" #ifdef BIAS\n"
-" const __global realN* restrict egm,\n"
+" #if BIAS_TYPE>0\n"
+" __global realN* restrict egm,\n"
 " #endif\n"
 " __global realM* cgm,const real alpha,const real beta\n"
 " #if SA == 1 && SB == 1\n"
@@ -20370,11 +20174,11 @@ const char* matmul_params_buf =
 " #pragma unroll\n"
 " for (int _ni=0; _ni<NWI/VWN; _ni += 1) {\n"
 " // Loads data: off-chip --> private (matrix B)\n"
-" bpm[_ni]=GlobalToPrivateOptB(bgm,baseIndexB,_ni,kSizeN,idk);\n"
+" bpm[_ni]=GlobalToPrivateOptB(bgm,baseIndexB,_ni,stride.s1/*kSizeN*/,idk);\n"
 " }\n"
 " #pragma unroll\n"
 " for (int _mi=0; _mi<MWI/VWM; _mi += 1) {\n"
-" const realM aval=GlobalToPrivateOptA(agm,baseIndexA,_mi,kSizeM,idk);\n"
+" const realM aval=GlobalToPrivateOptA(agm,baseIndexA,_mi,stride.s0/*kSizeM*/,idk);\n"
 " #pragma unroll\n"
 " for (int _ni=0; _ni<NWI/VWN; _ni += 1) {\n"
 " #if VWM == 1\n"
@@ -20428,11 +20232,11 @@ const char* matmul_params_buf =
 " #pragma unroll\n"
 " for (int _mi=0; _mi<MWI/VWM; _mi += 1) {\n"
 " // Loads data: off-chip --> private (matrix B)\n"
-" apm[_mi]=GlobalToPrivateOptA(agm,baseIndexA,_mi,kSizeM,idk);\n"
+" apm[_mi]=GlobalToPrivateOptA(agm,baseIndexA,_mi,stride.s0/*kSizeM*/,idk);\n"
 " }\n"
 " #pragma unroll\n"
 " for (int _ni=0; _ni<NWI/VWN; _ni += 1) {\n"
-" const realN bval=GlobalToPrivateOptB(bgm,baseIndexB,_ni,kSizeN,idk);\n"
+" const realN bval=GlobalToPrivateOptB(bgm,baseIndexB,_ni,stride.s1/*kSizeN*/,idk);\n"
 " #pragma unroll\n"
 " for (int _mi=0; _mi<MWI/VWM; _mi += 1) {\n"
 " const realM aval=apm[_mi];\n"
@@ -20485,7 +20289,7 @@ const char* matmul_params_buf =
 " #endif\n"
 " #ifdef OUTPUTMN\n"
 " INT2 baseOffset=StoreIndexN();\n"
-" #ifdef BIAS\n"
+" #if BIAS_TYPE == 1\n"
 " #pragma promote_to_registers\n"
 " realN epm[NWI/VWN]; // MWI*1\n"
 " for (int _ni=0; _ni<NWI/VWN; _ni += 1) {\n"
@@ -20497,16 +20301,21 @@ const char* matmul_params_buf =
 " epm[_ni]=egm[idn];\n"
 " }\n"
 " #endif\n"
+" \n"
+" \n"
+" \n"
 " #pragma unroll\n"
 " for (int _mi=0; _mi<MWI; _mi += 1) {\n"
 " #pragma unroll\n"
 " for (int _ni=0; _ni<NWI/VWN; _ni += 1) {\n"
 " StoreResultsN((__global realN* )cgm,cpn[_mi*(NWI/VWN)+_ni],\n"
 " baseOffset,\n"
-" #ifdef BIAS\n"
+" #if BIAS_TYPE>1\n"
+" (__global realN*)egm,\n"
+" #elif BIAS_TYPE == 1\n"
 " (realN*)epm,\n"
 " #endif\n"
-" _mi,_ni,kSizeN,alpha,beta);\n"
+" _mi,_ni,stride.s2,stride.s3,alpha,beta);\n"
 " }\n"
 " }\n"
 " \n"
@@ -20535,20 +20344,24 @@ const char* matmul_params_buf =
 " const real_arg arg_beta,\n"
 " const __global realM* restrict agm,// [K,M]\n"
 " const __global realN* restrict bgm,// [K,N]\n"
-" #ifdef BIAS\n"
-" const __global realN* restrict egm,// [N]\n"
+" #if BIAS_TYPE>0\n"
+" __global realN* restrict egm,// [N]\n"
 " #endif\n"
 " __global realM* cgm,\n"
-" const int a_offset,const int b_offset,const int c_offset\n"
+" __private const int4 offset,\n"
+" __private const int4 stride\n"
 ") {\n"
 " const real alpha=GetRealArg(arg_alpha);\n"
 " const real beta=GetRealArg(arg_beta);\n"
 " \n"
 " // Adds the offsets (in case of use of a single temporary buffer for A,B,and C)\n"
-" agm=(const __global realM*)((const __global real*)agm+a_offset);\n"
-" bgm=(const __global realN*)((const __global real*)bgm+b_offset);\n"
-" cgm=(__global realM*)((const __global real*)cgm+c_offset);\n"
+" agm=(const __global realM*)((const __global real*)agm+offset.s0);\n"
+" bgm=(const __global realN*)((const __global real*)bgm+offset.s1);\n"
+" cgm=(__global realM*)((__global real*)cgm+offset.s2);\n"
 " \n"
+" #if BIAS_TYPE>0\n"
+" egm=(__global realN*)((__global real*)egm+offset.s3);\n"
+" #endif\n"
 " // Allocates workgroup-private memory (local memory)\n"
 " #if SA == 1\n"
 " __local realM alm[KWG*MWG/VWM];\n"
@@ -20559,26 +20372,26 @@ const char* matmul_params_buf =
 " \n"
 " // Computes the matrix-multiplication and stores the result in global memory\n"
 " #if SA == 1 && SB == 1\n"
-" XgemmBody(kSizeM,kSizeN,kSizeK,agm,bgm,\n"
-" #ifdef BIAS\n"
+" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm,bgm,\n"
+" #if BIAS_TYPE>0\n"
 " egm,\n"
 " #endif\n"
 " cgm,alpha,beta,alm,blm);\n"
 " #elif SA == 1\n"
-" XgemmBody(kSizeM,kSizeN,kSizeK,agm,bgm,\n"
-" #ifdef BIAS\n"
+" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm,bgm,\n"
+" #if BIAS_TYPE>0\n"
 " egm,\n"
 " #endif\n"
 " cgm,alpha,beta,alm);\n"
 " #elif SB == 1\n"
-" XgemmBody(kSizeM,kSizeN,kSizeK,agm,bgm,\n"
-" #ifdef BIAS\n"
+" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm,bgm,\n"
+" #if BIAS_TYPE>0\n"
 " egm,\n"
 " #endif\n"
 " cgm,alpha,beta,blm);\n"
 " #else\n"
-" XgemmBody(kSizeM,kSizeN,kSizeK,agm,bgm,\n"
-" #ifdef BIAS\n"
+" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm,bgm,\n"
+" #if BIAS_TYPE>0\n"
 " egm,\n"
 " #endif\n"
 " cgm,alpha,beta);\n"
@@ -20589,15 +20402,21 @@ const char* matmul_params_buf =
 "#else\n"
 " __kernel __attribute__((reqd_work_group_size(MDIMC,NDIMC,1)))\n"
 "#endif\n"
-"void XgemmBatched(const int kSizeM,const int kSizeN,const int kSizeK,\n"
+"void XgemmBatched(const int kSizeM,\n"
+" const int kSizeN,\n"
+" const int kSizeK,\n"
 " const real_arg arg_alpha,\n"
 " const real_arg arg_beta,\n"
-" const __global realM* restrict agm,const int batch_offset_a,\n"
-" const __global realN* restrict bgm,const int batch_offset_b,\n"
-" #ifdef BIAS\n"
-" const __global realN* restrict egm,const int batch_offset_e,\n"
+" const __global realM* restrict agm,\n"
+" const int batch_offset_a,\n"
+" const __global realN* restrict bgm,\n"
+" const int batch_offset_b,\n"
+" #if BIAS_TYPE>0\n"
+" __global realN* restrict egm,\n"
+" const int batch_offset_e,\n"
 " #endif\n"
-" __global realM* cgm,const int batch_offset_c) {\n"
+" __global realM* cgm,\n"
+" const int batch_offset_c) {\n"
 " const int batch=get_group_id(2);\n"
 " const real alpha=GetRealArg(arg_alpha);\n"
 " const real beta=GetRealArg(arg_beta);\n"
@@ -20610,9 +20429,9 @@ const char* matmul_params_buf =
 " const __global realN* restrict bgm_=&bgm[b_offset/VWN];\n"
 " __global realM* restrict cgm_=&cgm[c_offset/VWM];\n"
 " \n"
-" #ifdef BIAS\n"
+" #if BIAS_TYPE>0\n"
 " const int e_offset=batch*batch_offset_e;\n"
-" const __global realN* restrict egm_=&egm[e_offset/VWN];\n"
+" __global realN* restrict egm_=&egm[e_offset/VWN];\n"
 " #endif\n"
 " \n"
 " // Allocates workgroup-private memory (local memory)\n"
@@ -20622,29 +20441,37 @@ const char* matmul_params_buf =
 " #if SB == 1\n"
 " __local realN blm[KWG*NWG/VWN];\n"
 " #endif\n"
-" \n"
+" int4 stride;\n"
+" stride.s0=kSizeM;\n"
+" stride.s1=kSizeN;\n"
+" #ifdef OUTPUTMN\n"
+" stride.s2=kSizeN;\n"
+" #else\n"
+" stride.s2=kSizeM;\n"
+" #endif\n"
+" stride.s3=kSizeN;\n"
 " // Computes the matrix-multiplication and stores the result in global memory\n"
 " #if SA == 1 && SB == 1\n"
-" XgemmBody(kSizeM,kSizeN,kSizeK,agm_,bgm_,\n"
-" #ifdef BIAS\n"
+" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm_,bgm_,\n"
+" #if BIAS_TYPE>0\n"
 " egm_,\n"
 " #endif\n"
 " cgm_,alpha,beta,alm,blm);\n"
 " #elif SA == 1\n"
-" XgemmBody(kSizeM,kSizeN,kSizeK,agm_,bgm_,\n"
-" #ifdef BIAS\n"
+" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm_,bgm_,\n"
+" #if BIAS_TYPE>0\n"
 " egm_,\n"
 " #endif\n"
 " cgm_,alpha,beta,alm);\n"
 " #elif SB == 1\n"
-" XgemmBody(kSizeM,kSizeN,kSizeK,agm_,bgm_,\n"
-" #ifdef BIAS\n"
+" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm_,bgm_,\n"
+" #if BIAS_TYPE>0\n"
 " egm_,\n"
 " #endif\n"
 " cgm_,alpha,beta,blm);\n"
 " #else\n"
-" XgemmBody(kSizeM,kSizeN,kSizeK,agm_,bgm_,\n"
-" #ifdef BIAS\n"
+" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm_,bgm_,\n"
+" #if BIAS_TYPE>0\n"
 " egm_,\n"
 " #endif\n"
 " cgm_,alpha,beta);\n"
diff --git a/source/backend/opencl/execution/cl/opencl_source_map.hpp b/source/backend/opencl/execution/cl/opencl_source_map.hpp
index 8092ae61e..6ec7a2399 100644
--- a/source/backend/opencl/execution/cl/opencl_source_map.hpp
+++ b/source/backend/opencl/execution/cl/opencl_source_map.hpp
@@ -150,6 +150,9 @@ extern const char* input_transe_buf;
 extern const char* reduction_buf;
 #endif
 #ifndef MNN_OPENCL_BUFFER_CLOSED
+extern const char* strassen_binary_buf;
+#endif
+#ifndef MNN_OPENCL_BUFFER_CLOSED
 extern const char* matmul_params_buf;
 #endif
 extern const char* cast;
@@ -317,6 +320,9 @@ const std::map<std::string, const char*> OpenCLProgramMap =
 #ifndef MNN_OPENCL_BUFFER_CLOSED
   { "reduction_buf", reduction_buf },
 #endif
+#ifndef MNN_OPENCL_BUFFER_CLOSED
+  { "strassen_binary_buf", strassen_binary_buf },
+#endif
 #ifndef MNN_OPENCL_BUFFER_CLOSED
   { "matmul_params_buf", matmul_params_buf },
 #endif
diff --git a/source/backend/opencl/execution/cl/strassen_binary_buf.cl b/source/backend/opencl/execution/cl/strassen_binary_buf.cl
new file mode 100644
index 000000000..76894d266
--- /dev/null
+++ b/source/backend/opencl/execution/cl/strassen_binary_buf.cl
@@ -0,0 +1,101 @@
+#ifdef MNN_SUPPORT_FP16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+__kernel void binary_cfunction_buf(__private int global_dim0, __private int global_dim1,
+                         __global FLOAT* input0,
+                         __private const int offsetC,
+                         __private const int strideC,
+                         __global FLOAT* input1, __global FLOAT* output,
+                         __private const int width,//[offsetA, offsetB, offsetC, 0]
+                         __private const int height//[strideA, strideB, strideC, 0]
+) {
+    int2 pos = (int2)(get_global_id(0), get_global_id(1));// [X/16, Y]
+    
+    if (pos.x < global_dim0 && pos.y < global_dim1) {
+        int offset_11 = offsetC + pos.x * 8 + pos.y * strideC;
+        int offset_12 = offset_11 + width;
+        int offset_21 = offset_11 + strideC * height;
+        int offset_22 = offset_21 + width;
+
+        FLOAT8 in_11 = vload8(0, input0 + offset_11);
+        FLOAT8 in_12 = vload8(0, input0 + offset_12);
+        FLOAT8 in_21 = vload8(0, input0 + offset_21);
+        FLOAT8 in_22 = vload8(0, input0 + offset_22);
+        FLOAT8 in_cx = vload8(0, input1 + pos.x * 8 + pos.y * width);
+
+        in_12 = in_12 + in_cx;
+        in_21 = in_12 + in_21;
+        in_12 = in_22 + in_12;
+        in_22 = in_22 + in_21;
+        in_12 = in_11 + in_12;
+
+        vstore8(in_21, 0, output + offset_21);
+        vstore8(in_22, 0, output + offset_22);
+        vstore8(in_12, 0, output + offset_12);
+    }
+}
+
+#ifndef OPERATOR
+#define OPERATOR in0+in1
+#endif
+
+__kernel void binary_function_buf(__private int global_dim0, __private int global_dim1,
+                         __global FLOAT* input0, __global FLOAT* input1, __global FLOAT* output,
+                         __private const int4 baseOffsets,//[offsetA, offsetB, offsetC, 0]
+                         __private const int4 strides//[strideA, strideB, strideC, 0]
+) {
+    int2 pos = (int2)(get_global_id(0), get_global_id(1));// [X/16, Y]
+    
+    if (pos.x < global_dim0 && pos.y < global_dim1) {
+        const int baseOffsetA = baseOffsets.x;
+        const int baseOffsetB = baseOffsets.y;
+        const int baseOffsetC = baseOffsets.z;
+        const int strideA = strides.x;
+        const int strideB = strides.y;
+        const int strideC = strides.z;
+        
+        
+        int offsetA = pos.x * 8 + pos.y * VEC_H * strideA + baseOffsetA;
+        int offsetB = pos.x * 8 + pos.y * VEC_H * strideB + baseOffsetB;
+        int offsetC = pos.x * 8 + pos.y * VEC_H * strideC + baseOffsetC;
+
+        {
+            FLOAT8 in0 = vload8(0, input0 + offsetA);
+            FLOAT8 in1 = vload8(0, input1 + offsetB);
+            FLOAT8 out = OPERATOR;
+            vstore8(out, 0, output + offsetC);
+        }
+        #if VEC_H >= 2
+        {
+            offsetA += strideA;
+            offsetB += strideB;
+            offsetC += strideC;
+            FLOAT8 in0 = vload8(0, input0 + offsetA);
+            FLOAT8 in1 = vload8(0, input1 + offsetB);
+            FLOAT8 out = OPERATOR;
+            vstore8(out, 0, output + offsetC);
+        }
+        #endif
+        #if VEC_H == 4
+        {
+            offsetA += strideA;
+            offsetB += strideB;
+            offsetC += strideC;
+            FLOAT8 in0 = vload8(0, input0 + offsetA);
+            FLOAT8 in1 = vload8(0, input1 + offsetB);
+            FLOAT8 out = OPERATOR;
+            vstore8(out, 0, output + offsetC);
+        }
+        {
+            offsetA += strideA;
+            offsetB += strideB;
+            offsetC += strideC;
+            FLOAT8 in0 = vload8(0, input0 + offsetA);
+            FLOAT8 in1 = vload8(0, input1 + offsetB);
+            FLOAT8 out = OPERATOR;
+            vstore8(out, 0, output + offsetC);
+        }
+        #endif
+    }
+}
diff --git a/source/backend/opencl/execution/image/ConvExecution.cpp b/source/backend/opencl/execution/image/ConvExecution.cpp
index f83de1223..d5315ffee 100644
--- a/source/backend/opencl/execution/image/ConvExecution.cpp
+++ b/source/backend/opencl/execution/image/ConvExecution.cpp
@@ -93,7 +93,7 @@ ConvExecution::ConvExecution(const std::vector<Tensor *> &inputs, const std::vec
     
     std::shared_ptr<MNN::ConvolutionCommon::Int8Common> quanCommon;
     if (nullptr != conv2dParams->quanParameter()) {
-        quanCommon = ConvolutionCommon::load(conv2dParams, backend, true);
+        quanCommon = ConvolutionCommon::load(op, backend, true);
         if (nullptr == quanCommon) {
             MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", op->name()->c_str());
         }
diff --git a/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp b/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp
index 830e6737d..f40f3d644 100644
--- a/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp
+++ b/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp
@@ -12,7 +12,7 @@ namespace OpenCL {
 
 // set mDequantScale mDequantOffset mNumQuantBit mFilterDataPtr from mConv2dParams
 void ConvLowMemoryExecution::getInfoFromOpLowMemory(std::shared_ptr<ConvolutionCommon::Int8Common> & quanCommon) {
-    quanCommon = ConvolutionCommon::load(mResource->mConv2dParams, this->backend(), false, true);
+    quanCommon = ConvolutionCommon::load(mOp, this->backend(), false, true);
     if (mResource->mConv2dParams->quanParameter() != nullptr) {
         mLowMemoryFlag = true;
     } else {
@@ -24,6 +24,7 @@ void ConvLowMemoryExecution::getInfoFromOpLowMemory(std::shared_ptr<ConvolutionC
     // set mNumQuantBit
     if(quanCommon->canUseInt4){
         mNumQuantBit = 4;
+        mResource->mInputChannel = (quanCommon->weight.size() * 2) / (mResource->mKernelWidth * mResource->mKernelHeight * mResource->mOutputChannel);
     }else{
         mNumQuantBit = 8;
     }
@@ -71,58 +72,100 @@ void ConvLowMemoryExecution::getInfoFromOpLowMemory(std::shared_ptr<ConvolutionC
     // set mFilterDataPtr
     mFilterDataPtr = (void *)quanCommon->weight.get();
 }
+
+bool ConvLowMemoryExecution::convertToQuantWeight1x1Buffer(cl::Buffer input, int icPack, int ocPack) {
+#ifdef LOG_VERBOSE
+    MNN_PRINT("start convertToQuantWeight1x1Buffer !\n");
+#endif
+    auto runtime = mOpenCLBackend->getOpenCLRuntime();
+    std::string kernelName = "conv2d_1x1_ic_oc_weight_quant_buffer";
+    std::set<std::string> buildOptions;
+    if (mNumQuantBit == 8) {
+        buildOptions.emplace("-DUSE_LOW_BIT_WEIGHT_INT8");
+    } else if (mNumQuantBit == 4){
+        // int4 case
+        buildOptions.emplace("-DUSE_LOW_BIT_WEIGHT_INT4");
+    } else {/* More types to be supported. */}
+    if(mResource->mInputChannel % icPack != 0){
+        buildOptions.emplace("-DCHANNEL_LEAVE");
+    }
+
+    mBufferToConv1x1Kernel = runtime->buildKernelWithCache("buffer_convert_quant", kernelName, buildOptions);
+    auto kernel = mBufferToConv1x1Kernel->get();
+    uint32_t gws[2] = {static_cast<uint32_t>(UP_DIV(mResource->mInputChannel, icPack)), static_cast<uint32_t>(UP_DIV(mResource->mOutputChannel, ocPack))};
+
+    uint32_t idx = 0;
+    cl_int ret = CL_SUCCESS;
+    ret |= kernel.setArg(idx++, gws[0]);
+    ret |= kernel.setArg(idx++, gws[1]);
+    ret |= kernel.setArg(idx++, input);
+    ret |= kernel.setArg(idx++, *mResource->mKernelBuffer.get());
+    ret |= kernel.setArg(idx++, mResource->mInputChannel);
+    ret |= kernel.setArg(idx++, mResource->mOutputChannel);
+    ret |= kernel.setArg(idx++, icPack);
+    ret |= kernel.setArg(idx++, ocPack);
+    MNN_CHECK_CL_SUCCESS(ret, "setArg convertToQuantWeight1x1Buffer");
+
+    const uint32_t maxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mBufferToConv1x1Kernel));
+    const std::vector<uint32_t> lws = {16, std::max((uint32_t)1, maxWorkGroupSize / 16)};
+
+    cl::Event event;
+    cl_int res;
+
+    std::vector<uint32_t> roundUpGroupWorkSize(lws.size());
+    for (size_t i = 0; i < lws.size(); ++i) {
+        roundUpGroupWorkSize[i] = ROUND_UP(gws[i], lws[i]);
+    }
+
+    res = runtime->commandQueue().enqueueNDRangeKernel(kernel, cl::NullRange,
+                                                         cl::NDRange(roundUpGroupWorkSize[0], roundUpGroupWorkSize[1]),
+                                                         cl::NDRange(lws[0], lws[1]), nullptr, &event);
+
+    event.wait();
+    MNN_CHECK_CL_SUCCESS(res, "convertToQuantWeight1x1Buffer");
+
+#ifdef LOG_VERBOSE
+    MNN_PRINT("end convertToQuantWeight1x1Buffer !\n");
+#endif
+    return true;
+}
+
 // set mKernelBuffer for the 1x1 kernels
 void ConvLowMemoryExecution::set1x1WeightLowMemory(int packCout, int packCin, void * filterDataPtr, std::shared_ptr<ConvolutionCommon::Int8Common> & quanCommon) {
     cl_int res;
-    std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>({ROUND_UP(mResource->mOutputChannel, 8)/*Cout pack set to max 8*/, ROUND_UP(mResource->mInputChannel, packCin), mResource->mKernelWidth, mResource->mKernelHeight}));
+    std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>({ROUND_UP(mResource->mOutputChannel, packCout)/*Cout pack set to max 8*/, ROUND_UP(mResource->mInputChannel, packCin), 1, 1}));
     size_t buffer_size = filterBuffer->usize() / sizeof(float);
+    size_t cpy_size = mResource->mOutputChannel * mResource->mInputChannel;
     float *dequantAlpha = quanCommon->alpha.get();
     // shared part for all cases
-    if (mNumQuantBit == 8) {
-        // int8 case
-        buffer_size *= sizeof(int8_t);
-    } else if (mNumQuantBit == 4){
+    if (mNumQuantBit == 4){
         // int4 case
         buffer_size /= 2;
+        cpy_size = UP_DIV(cpy_size, 2);
     } else {/* More types to be supported. */}
-    mResource->mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size));
-    auto kernelBufferPtr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*(mResource->mKernelBuffer.get()), true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res);
-    if(kernelBufferPtr != nullptr && res == CL_SUCCESS){
-        ::memset(kernelBufferPtr, 0, buffer_size);
-
-        
-        for(int o = 0; o < mResource->mOutputChannel; o++){
-            int i = 0;
-            for(; i < mResource->mInputChannel; i++){
-                int bufferIdx = (o/packCout) * packCin*packCout + (i/packCin)*packCin*ROUND_UP(mResource->mOutputChannel, packCout) + (i%packCin)*packCout + (o%packCout);//(Ci/packCin， Co/packCout, packCin， packCout)
-                int filterIdx = o*mResource->mInputChannel + i;
-                if (mNumQuantBit == 8) {
-                    // int8 case
-                    ((int8_t *)kernelBufferPtr)[bufferIdx] = (int8_t)(((int8_t *)filterDataPtr)[filterIdx]);
-                } else if (mNumQuantBit == 4){
-                    // int4 case
-                    if (bufferIdx % 2 == 0) {
-                        ((uint8_t *)kernelBufferPtr)[bufferIdx / 2] += (uint8_t)((((int8_t *)filterDataPtr)[filterIdx] + 8) * 16);
-                    } else {
-                        ((uint8_t *)kernelBufferPtr)[bufferIdx / 2] += (uint8_t)(((int8_t *)filterDataPtr)[filterIdx] + 8);
-                    }
-                } else {/* More types to be supported. */}
-            }
-        }
+    cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size);
+    void *mapPtr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(filterBufferCL, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res);
+    if(mapPtr != nullptr && res == CL_SUCCESS){
+        ::memcpy(mapPtr, filterDataPtr, cpy_size);
     } else {
         MNN_ERROR("set1x1WeightLowMemory: Map error ptrCL == nullptr \n");
         MNN_ASSERT(false);
     }
-    mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*(mResource->mKernelBuffer.get()), kernelBufferPtr);
+    mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(filterBufferCL, mapPtr);
+    
+    mResource->mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size));
+    convertToQuantWeight1x1Buffer(filterBufferCL, packCin, packCout);
 }
 // set mFilter for the general kernels
 void ConvLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, std::shared_ptr<ConvolutionCommon::Int8Common> & quanCommon) {
     if (filterDataPtr != nullptr) {
-        std::vector<int> filterImageShape{ROUND_UP(mResource->mInputChannel, 4), (UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight)};
-        std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>({mResource->mOutputChannel, ROUND_UP(mResource->mInputChannel, 4), mResource->mKernelWidth, mResource->mKernelHeight}));
-        // int buffer_size = filterBuffer->elementSize();
+        std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>({ROUND_UP(mResource->mOutputChannel, 4), mResource->mInputChannel, mResource->mKernelWidth, mResource->mKernelHeight}));
         size_t buffer_size = filterBuffer->usize() / sizeof(float);
-        buffer_size *= sizeof(int8_t);
+        size_t cpy_size = mResource->mOutputChannel * mResource->mInputChannel * mResource->mKernelWidth * mResource->mKernelHeight;
+        if (mNumQuantBit == 4){
+            buffer_size /= 2;
+            cpy_size = UP_DIV(cpy_size, 2);
+        }
         cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size);
         filterBuffer->buffer().device = (uint64_t)(&filterBufferCL);
         float *dequantAlpha = quanCommon->alpha.get();
@@ -130,14 +173,7 @@ void ConvLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, std:
         cl_int res;
         auto ptrCL = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(filterBufferCL, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res);
         if(ptrCL != nullptr && res == CL_SUCCESS) {
-            ::memset(ptrCL, 0, buffer_size);
-            const int copy_size = mResource->mKernelWidth * mResource->mKernelHeight * sizeof(int8_t);
-            for(int oc=0; oc<mResource->mOutputChannel; oc++) {
-                int ic = 0;
-                for(; ic<mResource->mInputChannel; ic++) {
-                    ::memcpy((int8_t *)ptrCL + (oc * ROUND_UP(mResource->mInputChannel, 4) + ic) * mResource->mKernelWidth * mResource->mKernelHeight, ((int8_t *)filterDataPtr) + (oc * mResource->mInputChannel + ic) * mResource->mKernelWidth * mResource->mKernelHeight, copy_size);
-                }
-            }
+            ::memcpy(ptrCL, filterDataPtr, cpy_size);
         } else {
             MNN_ERROR("setGeneralWeightLowMemory: Map error ptrCL == nullptr \n");
         }
@@ -145,7 +181,7 @@ void ConvLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, std:
         // convert to NC4HW4
         if (mNumQuantBit == 8) {
             // ROUND_UP(IC, 4), UP_DIV(OC, 4) * mKernelWidth * mKernelHeight
-            mResource->mFilter.reset(Tensor::createDevice<int8_t>({1, filterImageShape[1], 1, 4 * filterImageShape[0]}));
+            mResource->mFilter.reset(Tensor::createDevice<int8_t>({1, UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight, 1, 4 * ROUND_UP(mResource->mInputChannel, 4)}));
             mResource->mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size));
             mResource->mFilter->buffer().device = (uint64_t)(mResource->mKernelBuffer.get());
             MNN::OpenCL::BufferConvertor bufferConvertor{mOpenCLBackend->getOpenCLRuntime()};
@@ -156,8 +192,8 @@ void ConvLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, std:
             // For int4 case, data stored in mFilter should be uint8_t
             // while "Tensor::createDevice<uint8_t>" occupies more memory than "Tensor::createDevice<int8_t>".
             // Therefore, we use "Tensor::createDevice<int8_t>" currently, leaving "Tensor::createDevice<uint8_t>" to be supported.
-            mResource->mFilter.reset(Tensor::createDevice<int8_t>({1, filterImageShape[1], 1, 2 * filterImageShape[0]}));
-            mResource->mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size/2));
+            mResource->mFilter.reset(Tensor::createDevice<int8_t>({1, UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight, 1, 2 * ROUND_UP(mResource->mInputChannel, 4)}));
+            mResource->mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size));
             mResource->mFilter->buffer().device = (uint64_t)(mResource->mKernelBuffer.get());
             MNN::OpenCL::BufferConvertor bufferConvertor{mOpenCLBackend->getOpenCLRuntime()};
             // filterBuffer shape: {OC, ROUND_UP(IC, 4), mKernelWidth, mKernelHeight}
diff --git a/source/backend/opencl/execution/image/ConvLowMemoryExecution.hpp b/source/backend/opencl/execution/image/ConvLowMemoryExecution.hpp
index af6d7d897..b7e22c41f 100644
--- a/source/backend/opencl/execution/image/ConvLowMemoryExecution.hpp
+++ b/source/backend/opencl/execution/image/ConvLowMemoryExecution.hpp
@@ -30,6 +30,7 @@ class ConvLowMemoryExecution : public ConvCommonExecution, public CommonExecutio
     void tune1x1CaseLowMemory(Tensor * input, Tensor * output);
     void tuneGeneralCaseLowMemory(Tensor * input, Tensor * output);
     void tuneGemmLowMemory(Tensor * input, Tensor * output);
+    bool convertToQuantWeight1x1Buffer(cl::Buffer input, int icPack, int ocPack);
     std::vector<int> mPaddings{0, 0};
     std::vector<uint32_t> mGlobalWorkSize{1, 1, 1};
     std::vector<uint32_t> mLocalWorkSize{1, 1, 1, 1};
@@ -37,6 +38,7 @@ class ConvLowMemoryExecution : public ConvCommonExecution, public CommonExecutio
     void *mFilterDataPtr = nullptr;
     bool mLowMemoryFlag = false;
     int mNumQuantBit = 0;
+    std::shared_ptr<KernelWrap> mBufferToConv1x1Kernel = nullptr;
 };
 
 } // namespace OpenCL
diff --git a/source/backend/opencl/execution/image/ConvWinograd.cpp b/source/backend/opencl/execution/image/ConvWinograd.cpp
index 25f793286..b6b7e2442 100644
--- a/source/backend/opencl/execution/image/ConvWinograd.cpp
+++ b/source/backend/opencl/execution/image/ConvWinograd.cpp
@@ -68,7 +68,7 @@ ConvWinograd::ConvWinograd(const MNN::Op *op, Backend* backend) : CommonExecutio
 
     std::shared_ptr<MNN::ConvolutionCommon::Int8Common> quanCommon;
     if (nullptr != conv2D->quanParameter()) {
-        quanCommon = ConvolutionCommon::load(conv2D, backend, true);
+        quanCommon = ConvolutionCommon::load(op, backend, true);
         if (nullptr == quanCommon) {
             MNN_ERROR("Memory not Enough, can't extract IDST Convolution \n");
         }
diff --git a/source/backend/opencl/execution/image/DeconvExecution.cpp b/source/backend/opencl/execution/image/DeconvExecution.cpp
index b16abb3ec..d9ee162b1 100644
--- a/source/backend/opencl/execution/image/DeconvExecution.cpp
+++ b/source/backend/opencl/execution/image/DeconvExecution.cpp
@@ -28,7 +28,7 @@ DeconvExecution::DeconvExecution(const std::vector<Tensor *> &inputs, const MNN:
     const float* filterDataPtr = nullptr;
     int weightSize = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2dParams, &filterDataPtr, &weightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &weightSize);
 
     int inputChannel  = weightSize / (kernelWidth * kernelHeight * outputChannel);
     std::vector<int> filterShape{outputChannel, inputChannel, kernelHeight, kernelWidth};
diff --git a/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp b/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp
index dbdbd1bbd..d81a67d78 100644
--- a/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp
+++ b/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp
@@ -37,7 +37,7 @@ DepthwiseConvExecution::DepthwiseConvExecution(const std::vector<Tensor *> &inpu
     const float* filterDataPtr = nullptr;
     int filterDataSize   = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, backend, mResource->mConv2dParams, &filterDataPtr, &filterDataSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &filterDataSize);
 
     mResource->mFilter.reset(Tensor::createDevice<float>({1, filterImageShape[1], 1, 4 * filterImageShape[0]}));
     std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>(filterShape));
diff --git a/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp b/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp
index da85385a7..0ddaf7f0a 100644
--- a/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp
+++ b/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp
@@ -33,7 +33,7 @@ DepthwiseDeconvExecution::DepthwiseDeconvExecution(const std::vector<Tensor *> &
     const float* filterDataPtr = nullptr;
     int tempWeightSize   = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, backend, mResource->mConv2dParams, &filterDataPtr, &tempWeightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &tempWeightSize);
 
         mResource->mFilter.reset(Tensor::createDevice<float>({1, filterImageShape[1], 1, 4 * filterImageShape[0]}));
     std::shared_ptr<Tensor> filterBuffer(Tensor::createDevice<float>(filterShape));
diff --git a/source/backend/opencl/schema/CLCache.fbs b/source/backend/opencl/schema/CLCache.fbs
index d53ce263f..958bbbb0c 100644
--- a/source/backend/opencl/schema/CLCache.fbs
+++ b/source/backend/opencl/schema/CLCache.fbs
@@ -29,11 +29,17 @@ table GemmInfo {
     paramInfo:[uint];
 }
 
+table PreParamInfo{
+    preParamName:string;
+    preParamData:uint;
+}
+
 table Cache {
     programs:[Shader];
     tunings:[Autotuning];
     tuned:[OpInfo];
     gemm:[GemmInfo];
+    preParam:[PreParamInfo];
 }
 
 root_type Cache;
diff --git a/source/backend/opencl/schema/current/CLCache_generated.h b/source/backend/opencl/schema/current/CLCache_generated.h
index 5918d6049..1fbe47226 100644
--- a/source/backend/opencl/schema/current/CLCache_generated.h
+++ b/source/backend/opencl/schema/current/CLCache_generated.h
@@ -23,6 +23,9 @@ struct AutotuningT;
 struct GemmInfo;
 struct GemmInfoT;
 
+struct PreParamInfo;
+struct PreParamInfoT;
+
 struct Cache;
 struct CacheT;
 
@@ -36,6 +39,8 @@ inline const flatbuffers::TypeTable *AutotuningTypeTable();
 
 inline const flatbuffers::TypeTable *GemmInfoTypeTable();
 
+inline const flatbuffers::TypeTable *PreParamInfoTypeTable();
+
 inline const flatbuffers::TypeTable *CacheTypeTable();
 
 struct TensorInfoT : public flatbuffers::NativeTable {
@@ -420,12 +425,78 @@ inline flatbuffers::Offset<GemmInfo> CreateGemmInfo(
 
 flatbuffers::Offset<GemmInfo> CreateGemmInfo(flatbuffers::FlatBufferBuilder &_fbb, const GemmInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct PreParamInfoT : public flatbuffers::NativeTable {
+  typedef PreParamInfo TableType;
+  std::string preParamName;
+  uint32_t preParamData;
+  PreParamInfoT()
+      : preParamData(0) {
+  }
+};
+
+struct PreParamInfo FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef PreParamInfoT NativeTableType;
+  static const flatbuffers::TypeTable *MiniReflectTypeTable() {
+    return PreParamInfoTypeTable();
+  }
+  const flatbuffers::String *preParamName() const {
+    return GetPointer<const flatbuffers::String *>(4);
+  }
+  uint32_t preParamData() const {
+    return GetField<uint32_t>(6, 0);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyOffset(verifier, 4) &&
+           verifier.VerifyString(preParamName()) &&
+           VerifyField<uint32_t>(verifier, 6) &&
+           verifier.EndTable();
+  }
+  PreParamInfoT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(PreParamInfoT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<PreParamInfo> Pack(flatbuffers::FlatBufferBuilder &_fbb, const PreParamInfoT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct PreParamInfoBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_preParamName(flatbuffers::Offset<flatbuffers::String> preParamName) {
+    fbb_.AddOffset(4, preParamName);
+  }
+  void add_preParamData(uint32_t preParamData) {
+    fbb_.AddElement<uint32_t>(6, preParamData, 0);
+  }
+  explicit PreParamInfoBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  PreParamInfoBuilder &operator=(const PreParamInfoBuilder &);
+  flatbuffers::Offset<PreParamInfo> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<PreParamInfo>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<PreParamInfo> CreatePreParamInfo(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::String> preParamName = 0,
+    uint32_t preParamData = 0) {
+  PreParamInfoBuilder builder_(_fbb);
+  builder_.add_preParamData(preParamData);
+  builder_.add_preParamName(preParamName);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<PreParamInfo> CreatePreParamInfo(flatbuffers::FlatBufferBuilder &_fbb, const PreParamInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct CacheT : public flatbuffers::NativeTable {
   typedef Cache TableType;
   std::vector<std::unique_ptr<ShaderT>> programs;
   std::vector<std::unique_ptr<AutotuningT>> tunings;
   std::vector<std::unique_ptr<OpInfoT>> tuned;
   std::vector<std::unique_ptr<GemmInfoT>> gemm;
+  std::vector<std::unique_ptr<PreParamInfoT>> preParam;
   CacheT() {
   }
 };
@@ -447,6 +518,9 @@ struct Cache FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const flatbuffers::Vector<flatbuffers::Offset<GemmInfo>> *gemm() const {
     return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<GemmInfo>> *>(10);
   }
+  const flatbuffers::Vector<flatbuffers::Offset<PreParamInfo>> *preParam() const {
+    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<PreParamInfo>> *>(12);
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyOffset(verifier, 4) &&
@@ -461,6 +535,9 @@ struct Cache FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyOffset(verifier, 10) &&
            verifier.VerifyVector(gemm()) &&
            verifier.VerifyVectorOfTables(gemm()) &&
+           VerifyOffset(verifier, 12) &&
+           verifier.VerifyVector(preParam()) &&
+           verifier.VerifyVectorOfTables(preParam()) &&
            verifier.EndTable();
   }
   CacheT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -483,6 +560,9 @@ struct CacheBuilder {
   void add_gemm(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<GemmInfo>>> gemm) {
     fbb_.AddOffset(10, gemm);
   }
+  void add_preParam(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<PreParamInfo>>> preParam) {
+    fbb_.AddOffset(12, preParam);
+  }
   explicit CacheBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -500,8 +580,10 @@ inline flatbuffers::Offset<Cache> CreateCache(
     flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Shader>>> programs = 0,
     flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Autotuning>>> tunings = 0,
     flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<OpInfo>>> tuned = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<GemmInfo>>> gemm = 0) {
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<GemmInfo>>> gemm = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<PreParamInfo>>> preParam = 0) {
   CacheBuilder builder_(_fbb);
+  builder_.add_preParam(preParam);
   builder_.add_gemm(gemm);
   builder_.add_tuned(tuned);
   builder_.add_tunings(tunings);
@@ -671,6 +753,35 @@ inline flatbuffers::Offset<GemmInfo> CreateGemmInfo(flatbuffers::FlatBufferBuild
       _paramInfo);
 }
 
+inline PreParamInfoT *PreParamInfo::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new PreParamInfoT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void PreParamInfo::UnPackTo(PreParamInfoT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = preParamName(); if (_e) _o->preParamName = _e->str(); };
+  { auto _e = preParamData(); _o->preParamData = _e; };
+}
+
+inline flatbuffers::Offset<PreParamInfo> PreParamInfo::Pack(flatbuffers::FlatBufferBuilder &_fbb, const PreParamInfoT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreatePreParamInfo(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<PreParamInfo> CreatePreParamInfo(flatbuffers::FlatBufferBuilder &_fbb, const PreParamInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const PreParamInfoT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _preParamName = _o->preParamName.empty() ? 0 : _fbb.CreateString(_o->preParamName);
+  auto _preParamData = _o->preParamData;
+  return CLCache::CreatePreParamInfo(
+      _fbb,
+      _preParamName,
+      _preParamData);
+}
+
 inline CacheT *Cache::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new CacheT();
   UnPackTo(_o, _resolver);
@@ -684,6 +795,7 @@ inline void Cache::UnPackTo(CacheT *_o, const flatbuffers::resolver_function_t *
   { auto _e = tunings(); if (_e) { _o->tunings.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->tunings[_i] = std::unique_ptr<AutotuningT>(_e->Get(_i)->UnPack(_resolver)); } } };
   { auto _e = tuned(); if (_e) { _o->tuned.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->tuned[_i] = std::unique_ptr<OpInfoT>(_e->Get(_i)->UnPack(_resolver)); } } };
   { auto _e = gemm(); if (_e) { _o->gemm.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->gemm[_i] = std::unique_ptr<GemmInfoT>(_e->Get(_i)->UnPack(_resolver)); } } };
+  { auto _e = preParam(); if (_e) { _o->preParam.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->preParam[_i] = std::unique_ptr<PreParamInfoT>(_e->Get(_i)->UnPack(_resolver)); } } };
 }
 
 inline flatbuffers::Offset<Cache> Cache::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CacheT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -698,12 +810,14 @@ inline flatbuffers::Offset<Cache> CreateCache(flatbuffers::FlatBufferBuilder &_f
   auto _tunings = _o->tunings.size() ? _fbb.CreateVector<flatbuffers::Offset<Autotuning>> (_o->tunings.size(), [](size_t i, _VectorArgs *__va) { return CreateAutotuning(*__va->__fbb, __va->__o->tunings[i].get(), __va->__rehasher); }, &_va ) : 0;
   auto _tuned = _o->tuned.size() ? _fbb.CreateVector<flatbuffers::Offset<OpInfo>> (_o->tuned.size(), [](size_t i, _VectorArgs *__va) { return CreateOpInfo(*__va->__fbb, __va->__o->tuned[i].get(), __va->__rehasher); }, &_va ) : 0;
   auto _gemm = _o->gemm.size() ? _fbb.CreateVector<flatbuffers::Offset<GemmInfo>> (_o->gemm.size(), [](size_t i, _VectorArgs *__va) { return CreateGemmInfo(*__va->__fbb, __va->__o->gemm[i].get(), __va->__rehasher); }, &_va ) : 0;
+  auto _preParam = _o->preParam.size() ? _fbb.CreateVector<flatbuffers::Offset<PreParamInfo>> (_o->preParam.size(), [](size_t i, _VectorArgs *__va) { return CreatePreParamInfo(*__va->__fbb, __va->__o->preParam[i].get(), __va->__rehasher); }, &_va ) : 0;
   return CLCache::CreateCache(
       _fbb,
       _programs,
       _tunings,
       _tuned,
-      _gemm);
+      _gemm,
+      _preParam);
 }
 
 inline const flatbuffers::TypeTable *TensorInfoTypeTable() {
@@ -794,27 +908,45 @@ inline const flatbuffers::TypeTable *GemmInfoTypeTable() {
   return &tt;
 }
 
+inline const flatbuffers::TypeTable *PreParamInfoTypeTable() {
+  static const flatbuffers::TypeCode type_codes[] = {
+    { flatbuffers::ET_STRING, 0, -1 },
+    { flatbuffers::ET_UINT, 0, -1 }
+  };
+  static const char * const names[] = {
+    "preParamName",
+    "preParamData"
+  };
+  static const flatbuffers::TypeTable tt = {
+    flatbuffers::ST_TABLE, 2, type_codes, nullptr, nullptr, names
+  };
+  return &tt;
+}
+
 inline const flatbuffers::TypeTable *CacheTypeTable() {
   static const flatbuffers::TypeCode type_codes[] = {
     { flatbuffers::ET_SEQUENCE, 1, 0 },
     { flatbuffers::ET_SEQUENCE, 1, 1 },
     { flatbuffers::ET_SEQUENCE, 1, 2 },
-    { flatbuffers::ET_SEQUENCE, 1, 3 }
+    { flatbuffers::ET_SEQUENCE, 1, 3 },
+    { flatbuffers::ET_SEQUENCE, 1, 4 }
   };
   static const flatbuffers::TypeFunction type_refs[] = {
     ShaderTypeTable,
     AutotuningTypeTable,
     OpInfoTypeTable,
-    GemmInfoTypeTable
+    GemmInfoTypeTable,
+    PreParamInfoTypeTable
   };
   static const char * const names[] = {
     "programs",
     "tunings",
     "tuned",
-    "gemm"
+    "gemm",
+    "preParam"
   };
   static const flatbuffers::TypeTable tt = {
-    flatbuffers::ST_TABLE, 4, type_codes, type_refs, nullptr, names
+    flatbuffers::ST_TABLE, 5, type_codes, type_refs, nullptr, names
   };
   return &tt;
 }
diff --git a/source/backend/tensorrt/execution/TRTConvolution.cpp b/source/backend/tensorrt/execution/TRTConvolution.cpp
index 109b24cb1..831dae85a 100644
--- a/source/backend/tensorrt/execution/TRTConvolution.cpp
+++ b/source/backend/tensorrt/execution/TRTConvolution.cpp
@@ -34,7 +34,7 @@ std::vector<ITensor *> TRTConvolution::onEncode(const std::vector<ITensor *> &xO
     int weightSize      = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanWeight;
     if (nullptr != mOp->main_as_Convolution2D()->quanParameter()) {
-        quanWeight = ConvolutionCommon::load(mOp->main_as_Convolution2D(), backend(), true);
+        quanWeight = ConvolutionCommon::load(mOp, backend(), true);
         srcCount   = quanWeight->weightFloat.size() / (outputCount * kernelX * kernelY);
         source     = quanWeight->weightFloat.get();
         weightSize = quanWeight->weightFloat.size();
diff --git a/source/backend/tensorrt/execution/TRTDeconvolution.cpp b/source/backend/tensorrt/execution/TRTDeconvolution.cpp
index 835a17b86..45b7d3a4c 100755
--- a/source/backend/tensorrt/execution/TRTDeconvolution.cpp
+++ b/source/backend/tensorrt/execution/TRTDeconvolution.cpp
@@ -35,7 +35,7 @@ std::vector<ITensor *> TRTDeconvolution::onEncode(const std::vector<ITensor *> &
     int weightSize      = 0;
 
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, backend(), conv2D, &source, &weightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, backend(), mOp, &source, &weightSize);
 
     nvinfer1::DimsHW NVKSize(kernelY, kernelX);
     nvinfer1::DimsHW NVKSSize(conv2DCommon->strideY(), conv2DCommon->strideX());
@@ -56,7 +56,7 @@ std::vector<ITensor *> TRTDeconvolution::onEncode(const std::vector<ITensor *> &
 
     if (conv2DCommon->padMode() == PadMode_SAME) {
         conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
-    }    
+    }
     conv_layer->setName(mOp->name()->str().c_str());
     auto relu  = conv2DCommon->relu();
     auto relu6 = conv2DCommon->relu6();
diff --git a/source/backend/tensorrt/execution/TRTDepthwiseConvolution.cpp b/source/backend/tensorrt/execution/TRTDepthwiseConvolution.cpp
index e2beeb066..194bc0cad 100644
--- a/source/backend/tensorrt/execution/TRTDepthwiseConvolution.cpp
+++ b/source/backend/tensorrt/execution/TRTDepthwiseConvolution.cpp
@@ -36,7 +36,7 @@ std::vector<ITensor *> TRTDepthwiseConvolution::onEncode(const std::vector<ITens
     int weightSize      = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanWeight;
     if (nullptr != mOp->main_as_Convolution2D()->quanParameter()) {
-        quanWeight = ConvolutionCommon::load(mOp->main_as_Convolution2D(), backend(), true);
+        quanWeight = ConvolutionCommon::load(mOp, backend(), true);
         source     = quanWeight->weightFloat.get();
         weightSize = quanWeight->weightFloat.size();
     } else {
@@ -61,7 +61,7 @@ std::vector<ITensor *> TRTDepthwiseConvolution::onEncode(const std::vector<ITens
     conv_layer->setPadding(nvinfer1::DimsHW{pads.second, pads.first});
     if (conv2DCommon->padMode() == PadMode_SAME) {
         conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
-    }    
+    }
     conv_layer->setName(mOp->name()->str().c_str());
     auto relu  = conv2DCommon->relu();
     auto relu6 = conv2DCommon->relu6();
diff --git a/source/backend/tensorrt/execution/TRTDepthwiseDeconvolution.cpp b/source/backend/tensorrt/execution/TRTDepthwiseDeconvolution.cpp
index fe620a824..4e0fae803 100755
--- a/source/backend/tensorrt/execution/TRTDepthwiseDeconvolution.cpp
+++ b/source/backend/tensorrt/execution/TRTDepthwiseDeconvolution.cpp
@@ -33,9 +33,9 @@ std::vector<ITensor *> TRTDepthwiseDeconvolution::onEncode(const std::vector<ITe
     auto outputCount    = conv2DCommon->outputCount();
     const float *source = nullptr;
     int weightSize      = 0;
-    
+
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, backend(), conv2D, &source, &weightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, backend(), mOp, &source, &weightSize);
 
     nvinfer1::DimsHW NVKSize(kernelY, kernelX);
     nvinfer1::DimsHW NVKSSize(conv2DCommon->strideY(), conv2DCommon->strideX());
@@ -56,7 +56,7 @@ std::vector<ITensor *> TRTDepthwiseDeconvolution::onEncode(const std::vector<ITe
 
     if (conv2DCommon->padMode() == PadMode_SAME) {
         conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER);
-    }    
+    }
     conv_layer->setName(mOp->name()->str().c_str());
     auto relu  = conv2DCommon->relu();
     auto relu6 = conv2DCommon->relu6();
diff --git a/source/backend/vulkan/CMakeLists.txt b/source/backend/vulkan/CMakeLists.txt
index 9504a1ec2..555738aa7 100644
--- a/source/backend/vulkan/CMakeLists.txt
+++ b/source/backend/vulkan/CMakeLists.txt
@@ -43,5 +43,5 @@ else()
 endif()
 
 if (CMAKE_SYSTEM_NAME MATCHES "^Android")
-  add_definitions(-DVK_USE_PLATFORM_ANDROID_KHR)
+    add_definitions(-DVK_USE_PLATFORM_ANDROID_KHR)
 endif()
diff --git a/source/backend/vulkan/buffer/backend/VulkanBackend.cpp b/source/backend/vulkan/buffer/backend/VulkanBackend.cpp
index 7ddb8c713..34f992374 100644
--- a/source/backend/vulkan/buffer/backend/VulkanBackend.cpp
+++ b/source/backend/vulkan/buffer/backend/VulkanBackend.cpp
@@ -17,9 +17,6 @@
 #include "execution/VulkanBasicExecution.hpp"
 //#define MNN_OPEN_TIME_TRACE
 #include <MNN/AutoTime.hpp>
-#ifdef MNN_USE_NEON
-#include <arm_neon.h>
-#endif
 #define MNN_OP_SUPPORT_LOG
 //#define MNN_VULKAN_DUMP_MEMORY_USAGE
 
@@ -417,7 +414,7 @@ std::vector<uint32_t> VulkanBackend::autoTunePipeline(const VulkanPipeline* pipe
     mRuntime->mDevice->getMaxComputeWorkGroupSize(maxGroups);
     
     std::vector<uint32_t> lws_prefer(3, 1);
-    uint32_t min_cost = UINT_MAX;
+    float min_cost = -1.0f;
     
     while(lws[2] <= gws[2] && lws[2] <= maxGroups[2]) {
         lws[1] = 1;
@@ -430,8 +427,8 @@ std::vector<uint32_t> VulkanBackend::autoTunePipeline(const VulkanPipeline* pipe
                     groupSize[2] = UP_DIV(gws[2], lws[2]);
                     
                     pipeline->changePipeline(lws);
-                    int cost_time = (int)getPipelineTime(pipeline, des, groupSize);
-                    if(cost_time < min_cost) {
+                    auto cost_time = getPipelineTime(pipeline, des, groupSize);
+                    if(cost_time < min_cost || min_cost < 0.0f) {
                         min_cost = cost_time;
                         lws_prefer[0] = lws[0];
                         lws_prefer[1] = lws[1];
diff --git a/source/backend/vulkan/buffer/execution/VulkanConvolution.cpp b/source/backend/vulkan/buffer/execution/VulkanConvolution.cpp
index 87b1c85bf..ee8985122 100644
--- a/source/backend/vulkan/buffer/execution/VulkanConvolution.cpp
+++ b/source/backend/vulkan/buffer/execution/VulkanConvolution.cpp
@@ -341,7 +341,7 @@ class VulkanConvolutionSlideWindowsInt8 : public VulkanConvolutionCommon {
             }
         }
         vkBn->copyToGPUBuffer(wscaleData.data(), res.mWeightScale->buffer(), ocC4 * 4 * 2 * sizeof(float), 0);
-        
+
         // Build Pipeline
         // Create Pipeline
         std::vector<VkDescriptorType> convTypes{
@@ -428,9 +428,9 @@ class VulkanConvolutionCreator : public VulkanBackend::Creator {
                 }
             }
             if (quan->buffer() && OpType_Convolution == op->type()) {
-                quanWeight = ConvolutionCommon::load(op->main_as_Convolution2D(), backend, false, true);
+                quanWeight = ConvolutionCommon::load(op, backend, false, true);
             } else {
-                quanWeight = ConvolutionCommon::load(op->main_as_Convolution2D(), backend, true);
+                quanWeight = ConvolutionCommon::load(op, backend, true);
             }
             if (quanWeight->weight.get() != nullptr) {
                 useInt8Conv = true;
diff --git a/source/backend/vulkan/buffer/execution/VulkanDeconvolution.cpp b/source/backend/vulkan/buffer/execution/VulkanDeconvolution.cpp
index a79ea9d7c..59372ff91 100644
--- a/source/backend/vulkan/buffer/execution/VulkanDeconvolution.cpp
+++ b/source/backend/vulkan/buffer/execution/VulkanDeconvolution.cpp
@@ -11,13 +11,14 @@
 #include "core/TensorUtils.hpp"
 namespace MNN {
 static void _initKernelRegion() {
-    
+
 }
 VulkanDeconvolution::VulkanDeconvolution(Backend* bn) : VulkanBasicExecution(bn) {
     // Donthing
 }
 
-VulkanDeconvolution* VulkanDeconvolution::create(Backend* bn, const Convolution2D* conv, OpType type, bool multiInputs) {
+VulkanDeconvolution* VulkanDeconvolution::create(Backend* bn, const Op* op, OpType type, bool multiInputs) {
+    auto conv = op->main_as_Convolution2D();
     auto exeRes = new VulkanDeconvolution(bn);
     exeRes->mConvCommonOption = conv->common();
     auto vkBn         = (VulkanBackend*)bn;
@@ -45,7 +46,7 @@ VulkanDeconvolution* VulkanDeconvolution::create(Backend* bn, const Convolution2
     int tempWeightSize   = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
     if (!multiInputs) {
-        ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &tempWeight, &tempWeightSize);
+        ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &tempWeight, &tempWeightSize);
         MNN_ASSERT(nullptr != tempWeight);
         if (0 >= ci) {
             ci = tempWeightSize / co / kw / kh;
@@ -212,7 +213,7 @@ class VulkanDeconvolutionCreator : public VulkanBackend::Creator {
 public:
     virtual VulkanBasicExecution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op,
                                 Backend* backend) const override {
-        return VulkanDeconvolution::create(backend, op->main_as_Convolution2D(), op->type(), inputs.size() > 1);
+        return VulkanDeconvolution::create(backend, op, op->type(), inputs.size() > 1);
     }
 };
 
diff --git a/source/backend/vulkan/buffer/execution/VulkanDeconvolution.hpp b/source/backend/vulkan/buffer/execution/VulkanDeconvolution.hpp
index 97484a35f..97bcbe88f 100644
--- a/source/backend/vulkan/buffer/execution/VulkanDeconvolution.hpp
+++ b/source/backend/vulkan/buffer/execution/VulkanDeconvolution.hpp
@@ -17,7 +17,7 @@ class VulkanDeconvolution : public VulkanBasicExecution {
     virtual ~VulkanDeconvolution() {
     }
 
-    static VulkanDeconvolution* create(Backend* bn, const Convolution2D* conv, OpType type, bool multiInputs);
+    static VulkanDeconvolution* create(Backend* bn, const Op* op, OpType type, bool multiInputs);
     virtual ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const VulkanCommandPool::Buffer* cmdBuffer) override;
 
diff --git a/source/backend/vulkan/buffer/execution/VulkanUnary.cpp b/source/backend/vulkan/buffer/execution/VulkanUnary.cpp
index de98e09ed..81d53c465 100644
--- a/source/backend/vulkan/buffer/execution/VulkanUnary.cpp
+++ b/source/backend/vulkan/buffer/execution/VulkanUnary.cpp
@@ -71,6 +71,8 @@ static std::string _getMidType(const Op* op) {
             SETTYPE(UnaryOpOperation_SQUARE, "SQUARE");
             SETTYPE(UnaryOpOperation_LOG, "LOG");
             SETTYPE(UnaryOpOperation_GELU, "GELU");
+            // Since SPIR-V lacks a built-in erf (gauss error function) instruction and the existing shader implementation of GELU is essentially an approximation of erf, there is no need to add a new implementation of GELU_STANDARD.
+            SETTYPE(UnaryOpOperation_GELU_STANDARD, "GELU");
 
             SETTYPE(UnaryOpOperation_TAN, "TAN");
             SETTYPE(UnaryOpOperation_COS, "COS");
diff --git a/source/backend/vulkan/component/VulkanDevice.cpp b/source/backend/vulkan/component/VulkanDevice.cpp
index 79175277a..06f1eb5f2 100644
--- a/source/backend/vulkan/component/VulkanDevice.cpp
+++ b/source/backend/vulkan/component/VulkanDevice.cpp
@@ -10,14 +10,13 @@
 #include <string.h>
 //#define MNN_VULKAN_PRINT_EXT
 namespace MNN {
-VulkanDevice::VulkanDevice(std::shared_ptr<VulkanInstance> instance, const std::vector<const char*>& device_extensions)
+VulkanDevice::VulkanDevice(std::shared_ptr<VulkanInstance> instance)
     : mOwner(true),
       mInstance(instance),
       mQueueFamilyIndex(0),
       mPhysicalDevice(VK_NULL_HANDLE),
       mDevice(VK_NULL_HANDLE),
       mQueue(VK_NULL_HANDLE) {
-    MNN_ASSERT(mInstance->success());
     // Find one GPU to use:
     // On Android, every GPU device is equal -- supporting
     // graphics/compute/present
@@ -68,6 +67,23 @@ VulkanDevice::VulkanDevice(std::shared_ptr<VulkanInstance> instance, const std::
     mDeviceFeature.shaderStorageImageWriteWithoutFormat = VK_TRUE;
     //vkGetPhysicalDeviceFeatures(mPhysicalDevice, &mDeviceFeature);
 
+    // Set device extensions.
+    std::vector<const char*> deviceExtensions;
+    std::vector<const char*> deviceExtensionsToCheck = {
+        "VK_KHR_portability_subset"
+    };
+    uint32_t availableDeviceExtensionCount = 0;
+    CALL_VK(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr, &availableDeviceExtensionCount, nullptr));
+    std::vector<VkExtensionProperties> availableDeviceExtensions(availableDeviceExtensionCount);
+    CALL_VK(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr, &availableDeviceExtensionCount, availableDeviceExtensions.data()));
+    for (uint32_t i = 0; i < availableDeviceExtensionCount; i++) {
+        for (uint32_t j = 0; j < deviceExtensionsToCheck.size(); j++) {
+            if (strcmp(availableDeviceExtensions[i].extensionName, deviceExtensionsToCheck[j]) == 0) {
+                deviceExtensions.push_back(deviceExtensionsToCheck[j]);
+            }
+        }
+    }
+
     VkDeviceCreateInfo deviceCreateInfo{
         /* .sType                   = */ VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
         /* .pNext                   = */ nullptr,
@@ -76,11 +92,16 @@ VulkanDevice::VulkanDevice(std::shared_ptr<VulkanInstance> instance, const std::
         /* .pQueueCreateInfos       = */ &queueCreateInfo,
         /* .enabledLayerCount       = */ 0,
         /* .ppEnabledLayerNames     = */ nullptr,
-        /* .enabledExtensionCount   = */ static_cast<uint32_t>(device_extensions.size()),
-        /* .ppEnabledExtensionNames = */ device_extensions.data(),
+        /* .enabledExtensionCount   = */ static_cast<uint32_t>(deviceExtensions.size()),
+        /* .ppEnabledExtensionNames = */ deviceExtensions.data(),
         /* .pEnabledFeatures        = */ &mDeviceFeature,
     };
+    mDevice = VK_NULL_HANDLE;
     CALL_VK(vkCreateDevice(mPhysicalDevice, &deviceCreateInfo, nullptr, &mDevice));
+    if (VK_NULL_HANDLE == mDevice) {
+        MNN_ERROR("Can't create vk device\n");
+        return;
+    }
     vkGetPhysicalDeviceProperties(mPhysicalDevice, &mDeviceProty);
     vkGetPhysicalDeviceMemoryProperties(mPhysicalDevice, &mMemoryProty);
     getDeviceQueue(mQueueFamilyIndex, 0, mQueue);
diff --git a/source/backend/vulkan/component/VulkanDevice.hpp b/source/backend/vulkan/component/VulkanDevice.hpp
index 32a9d42f8..7eae18dca 100644
--- a/source/backend/vulkan/component/VulkanDevice.hpp
+++ b/source/backend/vulkan/component/VulkanDevice.hpp
@@ -18,8 +18,7 @@
 namespace MNN {
 class VulkanDevice : public NonCopyable {
 public:
-    explicit VulkanDevice(std::shared_ptr<VulkanInstance> instance,
-                          const std::vector<const char*>& device_extensions = {});
+    explicit VulkanDevice(std::shared_ptr<VulkanInstance> instance);
     explicit VulkanDevice(std::shared_ptr<VulkanInstance> instance, VkPhysicalDevice physicalDevice, VkDevice device,
                           uint32_t queueFamilyIndex, VkQueue queue);
     virtual ~VulkanDevice();
diff --git a/source/backend/vulkan/component/VulkanInstance.cpp b/source/backend/vulkan/component/VulkanInstance.cpp
index db2e5b2c7..4cee805fe 100644
--- a/source/backend/vulkan/component/VulkanInstance.cpp
+++ b/source/backend/vulkan/component/VulkanInstance.cpp
@@ -8,6 +8,7 @@
 
 #include "backend/vulkan/component/VulkanInstance.hpp"
 #include <vector>
+#include <algorithm>
 
 namespace MNN {
 VulkanInstance::VulkanInstance() : mOwner(true), mInstance(VK_NULL_HANDLE) {
@@ -20,17 +21,42 @@ VulkanInstance::VulkanInstance() : mOwner(true), mInstance(VK_NULL_HANDLE) {
         /* .engineVersion      = */ VK_MAKE_VERSION(1, 0, 0),
         /* .apiVersion         = */ VK_MAKE_VERSION(1, 0, 0),
     };
-    std::vector<const char*> instance_extensions;
+
+    // Set instance extensions.
+    std::vector<const char*> instanceExtensions;
+    std::vector<const char*> instanceExtensionsToCheck = {
+        VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME,
+        VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME
+    };
+    uint32_t availableInstanceExtensionCount = 0;
+    CALL_VK(vkEnumerateInstanceExtensionProperties(nullptr, &availableInstanceExtensionCount, nullptr));
+    std::vector<VkExtensionProperties> availableInstanceExtensions(availableInstanceExtensionCount);
+    CALL_VK(vkEnumerateInstanceExtensionProperties(nullptr, &availableInstanceExtensionCount, availableInstanceExtensions.data()));
+    for (uint32_t i = 0; i < availableInstanceExtensionCount; i++) {
+        for (uint32_t j = 0; j < instanceExtensionsToCheck.size(); j++) {
+            if (strcmp(availableInstanceExtensions[i].extensionName, instanceExtensionsToCheck[j]) == 0) {
+                instanceExtensions.push_back(instanceExtensionsToCheck[j]);
+            }
+        }
+    }
+
+    // Set instanceCreateFlag.
+    auto it = std::find_if(instanceExtensions.begin(), instanceExtensions.end(),
+                        [](const char* str) { return strcmp(str, VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME) == 0; });
+    VkInstanceCreateFlags instanceCreateFlag = (it != instanceExtensions.end()) ? VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR : 0;
+
 #ifdef MNN_VULKAN_DEBUG
+    MNN_PRINT("MNN_VULKAN_DEBUG is on.\n");
     const std::vector<const char*> validationLayers = {
         "VK_LAYER_KHRONOS_validation"
     };
 #endif
+
     // Create the Vulkan instance
     VkInstanceCreateInfo instanceCreateInfo{
         /* .sType                   = */ VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
         /* .pNext                   = */ nullptr,
-        /* .flags                   = */ 0,
+        /* .flags                   = */ instanceCreateFlag,
         /* .pApplicationInfo        = */ &appInfo,
 #ifdef MNN_VULKAN_DEBUG
         /* .enabledLayerCount       = */ 1,
@@ -39,8 +65,8 @@ VulkanInstance::VulkanInstance() : mOwner(true), mInstance(VK_NULL_HANDLE) {
         /* .enabledLayerCount       = */ 0,
         /* .ppEnabledLayerNames     = */ nullptr,
 #endif
-        /* .enabledExtensionCount   = */ static_cast<uint32_t>(instance_extensions.size()),
-        /* .ppEnabledExtensionNames = */ instance_extensions.data(),
+        /* .enabledExtensionCount   = */ static_cast<uint32_t>(instanceExtensions.size()),
+        /* .ppEnabledExtensionNames = */ instanceExtensions.data(),
     };
     CALL_VK(vkCreateInstance(&instanceCreateInfo, nullptr, &mInstance));
 }
@@ -65,6 +91,9 @@ void VulkanInstance::getPhysicalDeviceQueueFamilyProperties(const VkPhysicalDevi
 }
 
 const bool VulkanInstance::supportVulkan() const {
+    if (VK_NULL_HANDLE == mInstance) {
+        return false;
+    }
     uint32_t gpuCount = 0;
     auto res          = enumeratePhysicalDevices(gpuCount, nullptr);
     if ((0 == gpuCount) || (VK_SUCCESS != res)) {
diff --git a/source/backend/vulkan/component/VulkanInstance.hpp b/source/backend/vulkan/component/VulkanInstance.hpp
index 4a6969896..cc57d0c88 100644
--- a/source/backend/vulkan/component/VulkanInstance.hpp
+++ b/source/backend/vulkan/component/VulkanInstance.hpp
@@ -28,11 +28,6 @@ class VulkanInstance : public NonCopyable {
     VkInstance get() const {
         return mInstance;
     }
-
-    bool success() const {
-        return (VK_NULL_HANDLE != mInstance);
-    }
-
 private:
     bool mOwner;
     VkInstance mInstance;
diff --git a/source/backend/vulkan/component/VulkanPipeline.cpp b/source/backend/vulkan/component/VulkanPipeline.cpp
index c73a6b8bf..e0da6bcdd 100644
--- a/source/backend/vulkan/component/VulkanPipeline.cpp
+++ b/source/backend/vulkan/component/VulkanPipeline.cpp
@@ -55,8 +55,9 @@ VulkanPipeline* VulkanPipelineFactory::createComputePipeline(const uint8_t* data
     VkPipeline pipeline;
     /*for localSize_x_id = 0,localSize_y_id = 1,localSize_z_id = 2*/
     std::vector<VkSpecializationMapEntry> specializationMapEntry; /*localSize data description*/
-    std::shared_ptr<VkSpecializationInfo> specializationInfo = std::make_shared<VkSpecializationInfo>();
+    std::shared_ptr<VkSpecializationInfo> specializationInfo;
     if (localSize.size() > 0) {
+        specializationInfo = std::make_shared<VkSpecializationInfo>();
         // FUNC_PRINT(localSize.size());
         for (int i = 0; i < localSize.size(); i++) {
             VkSpecializationMapEntry entry = {(uint32_t)(i), (uint32_t)(sizeof(uint32_t) * i),
diff --git a/source/backend/vulkan/component/VulkanQueryPool.cpp b/source/backend/vulkan/component/VulkanQueryPool.cpp
index 82ef40a41..7cbe6a928 100644
--- a/source/backend/vulkan/component/VulkanQueryPool.cpp
+++ b/source/backend/vulkan/component/VulkanQueryPool.cpp
@@ -35,7 +35,7 @@ float VulkanQueryPool::VulkanGetQueryPoolResults(){
     vkGetQueryPoolResults(mDevice.get(), queryPool, 0, 2, sizeof(uint64_t) * 2, timestamps, sizeof(uint64_t), VK_QUERY_RESULT_WAIT_BIT);
     
     float timestampPeriod = mDevice.getTimestampPeriod();
-    float executionTime = (timestamps[1] - timestamps[0]) * timestampPeriod * 1e-3f; // 微妙
+    float executionTime = (timestamps[1] - timestamps[0]) * timestampPeriod * 1e-3f; // us
     return executionTime;
 }
 } // namespace MNN
diff --git a/source/backend/vulkan/image/backend/VulkanBackend.cpp b/source/backend/vulkan/image/backend/VulkanBackend.cpp
index 0892a53b0..0663ceba6 100644
--- a/source/backend/vulkan/image/backend/VulkanBackend.cpp
+++ b/source/backend/vulkan/image/backend/VulkanBackend.cpp
@@ -22,7 +22,7 @@
 #ifdef MNN_USE_NEON
 #include <arm_neon.h>
 #endif
-//#define MNN_OP_SUPPORT_LOG
+#define MNN_OP_SUPPORT_LOG
 //#define MNN_VULKAN_DUMP_MEMORY_USAGE
 #define MNN_VULKAN_MAX_CACHE_CONVSIZE 50
 namespace MNN {
@@ -89,9 +89,6 @@ const VulkanPipeline* VulkanBackend::getPipeline(const std::string& key, const s
 }
 
 bool VulkanBackend::_supportImageSize(const Tensor* MTensor) {
-    if (MTensor->getType().code != halide_type_float) {
-        return false;
-    }
     auto format = TensorUtils::getDescribe(MTensor)->dimensionFormat;
     if (format != MNN_DATA_FORMAT_NC4HW4) {
         return true;
diff --git a/source/backend/vulkan/image/compiler/AllShader.cpp b/source/backend/vulkan/image/compiler/AllShader.cpp
index 559166efc..e9b7860f1 100644
--- a/source/backend/vulkan/image/compiler/AllShader.cpp
+++ b/source/backend/vulkan/image/compiler/AllShader.cpp
@@ -229,18 +229,13 @@ const unsigned char glsl_deconvCol2Im_comp[] = {
   0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x93, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x93, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x93, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x94, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9a, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9a, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -376,7 +371,7 @@ const unsigned char glsl_deconvCol2Im_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x5b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x5b, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_deconvCol2Im_comp_len = 2352;
+unsigned int glsl_deconvCol2Im_comp_len = 2292;
 
 const unsigned char glsl_convolutionDepthwiseMali_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -431,31 +426,19 @@ const unsigned char glsl_convolutionDepthwiseMali_comp[] = {
   0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x6d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x9e, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xc7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xc7, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xc7, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc7, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc7, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc8, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -703,7 +686,7 @@ const unsigned char glsl_convolutionDepthwiseMali_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_convolutionDepthwiseMali_comp_len = 3868;
+unsigned int glsl_convolutionDepthwiseMali_comp_len = 3724;
 
 const unsigned char glsl_convolutionDepthwiseMali_RELU_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -758,31 +741,19 @@ const unsigned char glsl_convolutionDepthwiseMali_RELU_comp[] = {
   0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x6d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x9e, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xcb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xcb, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xcb, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcb, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcb, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcc, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xde, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xde, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -1036,7 +1007,7 @@ const unsigned char glsl_convolutionDepthwiseMali_RELU_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_convolutionDepthwiseMali_RELU_comp_len = 3940;
+unsigned int glsl_convolutionDepthwiseMali_RELU_comp_len = 3796;
 
 const unsigned char glsl_convolutionDepthwiseMali_RELU6_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -1091,31 +1062,19 @@ const unsigned char glsl_convolutionDepthwiseMali_RELU6_comp[] = {
   0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x6d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x9e, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xcd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xcd, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xcd, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcd, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcd, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xce, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe0, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe0, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -1373,7 +1332,7 @@ const unsigned char glsl_convolutionDepthwiseMali_RELU6_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_convolutionDepthwiseMali_RELU6_comp_len = 3988;
+unsigned int glsl_convolutionDepthwiseMali_RELU6_comp_len = 3844;
 
 const unsigned char glsl_relu_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -1408,19 +1367,13 @@ const unsigned char glsl_relu_comp[] = {
   0x47, 0x00, 0x03, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x34, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x35, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x3a, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x50, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x50, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x50, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x50, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x50, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x03, 0x00, 0x50, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x59, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
@@ -1518,7 +1471,7 @@ const unsigned char glsl_relu_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x2e, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_relu_comp_len = 1692;
+unsigned int glsl_relu_comp_len = 1620;
 
 const unsigned char glsl_unaryImage_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -1556,7 +1509,6 @@ const unsigned char glsl_unaryImage_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x3f, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x3f, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x44, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -1634,7 +1586,7 @@ const unsigned char glsl_unaryImage_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_comp_len = 1344;
+unsigned int glsl_unaryImage_comp_len = 1332;
 
 const unsigned char glsl_unaryImage_SIGMOID_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -1672,7 +1624,6 @@ const unsigned char glsl_unaryImage_SIGMOID_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x47, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x47, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x4c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -1761,7 +1712,7 @@ const unsigned char glsl_unaryImage_SIGMOID_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_SIGMOID_comp_len = 1468;
+unsigned int glsl_unaryImage_SIGMOID_comp_len = 1456;
 
 const unsigned char glsl_unaryImage_TANH_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -1799,7 +1750,6 @@ const unsigned char glsl_unaryImage_TANH_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -1879,7 +1829,7 @@ const unsigned char glsl_unaryImage_TANH_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_TANH_comp_len = 1368;
+unsigned int glsl_unaryImage_TANH_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_ABS_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -1917,7 +1867,6 @@ const unsigned char glsl_unaryImage_ABS_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -1997,7 +1946,7 @@ const unsigned char glsl_unaryImage_ABS_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_ABS_comp_len = 1368;
+unsigned int glsl_unaryImage_ABS_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_SQRT_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -2035,7 +1984,6 @@ const unsigned char glsl_unaryImage_SQRT_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -2115,7 +2063,7 @@ const unsigned char glsl_unaryImage_SQRT_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_SQRT_comp_len = 1368;
+unsigned int glsl_unaryImage_SQRT_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_RSQRT_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -2153,7 +2101,6 @@ const unsigned char glsl_unaryImage_RSQRT_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -2233,7 +2180,7 @@ const unsigned char glsl_unaryImage_RSQRT_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_RSQRT_comp_len = 1368;
+unsigned int glsl_unaryImage_RSQRT_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_NEG_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -2271,7 +2218,6 @@ const unsigned char glsl_unaryImage_NEG_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -2351,7 +2297,7 @@ const unsigned char glsl_unaryImage_NEG_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_NEG_comp_len = 1360;
+unsigned int glsl_unaryImage_NEG_comp_len = 1348;
 
 const unsigned char glsl_unaryImage_SQUARE_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -2389,7 +2335,6 @@ const unsigned char glsl_unaryImage_SQUARE_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x42, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x42, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x42, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x42, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x47, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -2469,7 +2414,7 @@ const unsigned char glsl_unaryImage_SQUARE_comp[] = {
   0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00,
   0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_SQUARE_comp_len = 1364;
+unsigned int glsl_unaryImage_SQUARE_comp_len = 1352;
 
 const unsigned char glsl_unaryImage_EXP_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -2507,7 +2452,6 @@ const unsigned char glsl_unaryImage_EXP_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -2587,7 +2531,7 @@ const unsigned char glsl_unaryImage_EXP_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_EXP_comp_len = 1368;
+unsigned int glsl_unaryImage_EXP_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_SIGN_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -2625,7 +2569,6 @@ const unsigned char glsl_unaryImage_SIGN_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -2705,7 +2648,7 @@ const unsigned char glsl_unaryImage_SIGN_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_SIGN_comp_len = 1368;
+unsigned int glsl_unaryImage_SIGN_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_LOG_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -2743,7 +2686,6 @@ const unsigned char glsl_unaryImage_LOG_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x44, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x44, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x49, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -2829,7 +2771,7 @@ const unsigned char glsl_unaryImage_LOG_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_LOG_comp_len = 1440;
+unsigned int glsl_unaryImage_LOG_comp_len = 1428;
 
 const unsigned char glsl_unaryImage_TAN_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -2867,7 +2809,6 @@ const unsigned char glsl_unaryImage_TAN_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -2947,7 +2888,7 @@ const unsigned char glsl_unaryImage_TAN_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_TAN_comp_len = 1368;
+unsigned int glsl_unaryImage_TAN_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_COS_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -2985,7 +2926,6 @@ const unsigned char glsl_unaryImage_COS_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -3065,7 +3005,7 @@ const unsigned char glsl_unaryImage_COS_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_COS_comp_len = 1368;
+unsigned int glsl_unaryImage_COS_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_SIN_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -3103,7 +3043,6 @@ const unsigned char glsl_unaryImage_SIN_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -3183,7 +3122,7 @@ const unsigned char glsl_unaryImage_SIN_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_SIN_comp_len = 1368;
+unsigned int glsl_unaryImage_SIN_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_CEIL_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -3221,7 +3160,6 @@ const unsigned char glsl_unaryImage_CEIL_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -3301,7 +3239,7 @@ const unsigned char glsl_unaryImage_CEIL_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_CEIL_comp_len = 1368;
+unsigned int glsl_unaryImage_CEIL_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_FLOOR_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -3339,7 +3277,6 @@ const unsigned char glsl_unaryImage_FLOOR_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -3419,7 +3356,7 @@ const unsigned char glsl_unaryImage_FLOOR_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_FLOOR_comp_len = 1368;
+unsigned int glsl_unaryImage_FLOOR_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_EXPM1_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -3457,7 +3394,6 @@ const unsigned char glsl_unaryImage_EXPM1_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x44, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x44, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x49, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -3543,7 +3479,7 @@ const unsigned char glsl_unaryImage_EXPM1_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_EXPM1_comp_len = 1432;
+unsigned int glsl_unaryImage_EXPM1_comp_len = 1420;
 
 const unsigned char glsl_unaryImage_RECIPROCAL_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -3581,7 +3517,6 @@ const unsigned char glsl_unaryImage_RECIPROCAL_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x43, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x43, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x43, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x43, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x48, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -3665,7 +3600,7 @@ const unsigned char glsl_unaryImage_RECIPROCAL_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_RECIPROCAL_comp_len = 1408;
+unsigned int glsl_unaryImage_RECIPROCAL_comp_len = 1396;
 
 const unsigned char glsl_unaryImage_SINH_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -3703,7 +3638,6 @@ const unsigned char glsl_unaryImage_SINH_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -3783,7 +3717,7 @@ const unsigned char glsl_unaryImage_SINH_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_SINH_comp_len = 1368;
+unsigned int glsl_unaryImage_SINH_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_ASINH_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -3821,7 +3755,6 @@ const unsigned char glsl_unaryImage_ASINH_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -3901,7 +3834,7 @@ const unsigned char glsl_unaryImage_ASINH_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_ASINH_comp_len = 1368;
+unsigned int glsl_unaryImage_ASINH_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_ASIN_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -3939,7 +3872,6 @@ const unsigned char glsl_unaryImage_ASIN_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -4019,7 +3951,7 @@ const unsigned char glsl_unaryImage_ASIN_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_ASIN_comp_len = 1368;
+unsigned int glsl_unaryImage_ASIN_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_COSH_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -4057,7 +3989,6 @@ const unsigned char glsl_unaryImage_COSH_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -4137,7 +4068,7 @@ const unsigned char glsl_unaryImage_COSH_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_COSH_comp_len = 1368;
+unsigned int glsl_unaryImage_COSH_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_ACOS_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -4175,7 +4106,6 @@ const unsigned char glsl_unaryImage_ACOS_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -4255,7 +4185,7 @@ const unsigned char glsl_unaryImage_ACOS_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_ACOS_comp_len = 1368;
+unsigned int glsl_unaryImage_ACOS_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_ACOSH_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -4293,7 +4223,6 @@ const unsigned char glsl_unaryImage_ACOSH_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -4373,7 +4302,7 @@ const unsigned char glsl_unaryImage_ACOSH_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_ACOSH_comp_len = 1368;
+unsigned int glsl_unaryImage_ACOSH_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_ATAN_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -4411,7 +4340,6 @@ const unsigned char glsl_unaryImage_ATAN_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -4491,7 +4419,7 @@ const unsigned char glsl_unaryImage_ATAN_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_ATAN_comp_len = 1368;
+unsigned int glsl_unaryImage_ATAN_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_ATANH_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -4529,7 +4457,6 @@ const unsigned char glsl_unaryImage_ATANH_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -4609,7 +4536,7 @@ const unsigned char glsl_unaryImage_ATANH_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_ATANH_comp_len = 1368;
+unsigned int glsl_unaryImage_ATANH_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_LOG1P_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -4647,7 +4574,6 @@ const unsigned char glsl_unaryImage_LOG1P_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x44, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x44, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x49, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -4733,7 +4659,7 @@ const unsigned char glsl_unaryImage_LOG1P_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_LOG1P_comp_len = 1432;
+unsigned int glsl_unaryImage_LOG1P_comp_len = 1420;
 
 const unsigned char glsl_unaryImage_ROUND_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -4771,7 +4697,6 @@ const unsigned char glsl_unaryImage_ROUND_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -4851,7 +4776,7 @@ const unsigned char glsl_unaryImage_ROUND_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_ROUND_comp_len = 1368;
+unsigned int glsl_unaryImage_ROUND_comp_len = 1356;
 
 const unsigned char glsl_unaryImage_HARDSWISH_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -4889,7 +4814,6 @@ const unsigned char glsl_unaryImage_HARDSWISH_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x60, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x60, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x60, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x60, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x65, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -5005,7 +4929,7 @@ const unsigned char glsl_unaryImage_HARDSWISH_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_HARDSWISH_comp_len = 1800;
+unsigned int glsl_unaryImage_HARDSWISH_comp_len = 1788;
 
 const unsigned char glsl_unaryImage_GELU_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -5043,7 +4967,6 @@ const unsigned char glsl_unaryImage_GELU_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x58, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x58, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x5d, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -5151,7 +5074,7 @@ const unsigned char glsl_unaryImage_GELU_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unaryImage_GELU_comp_len = 1696;
+unsigned int glsl_unaryImage_GELU_comp_len = 1684;
 
 const unsigned char glsl_im2col_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -5204,19 +5127,13 @@ const unsigned char glsl_im2col_comp[] = {
   0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x0a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x0a, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0a, 0x01, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x0b, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x1a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x1d, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x1d, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x1d, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x1d, 0x01, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x1e, 0x01, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x27, 0x01, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x01, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x01, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -5527,7 +5444,7 @@ const unsigned char glsl_im2col_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x66, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x66, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_im2col_comp_len = 4464;
+unsigned int glsl_im2col_comp_len = 4392;
 
 const unsigned char glsl_convolutionDepthwise_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -5582,31 +5499,19 @@ const unsigned char glsl_convolutionDepthwise_comp[] = {
   0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x6d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x9e, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xc7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xc7, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xc7, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc7, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc7, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc8, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -5854,7 +5759,7 @@ const unsigned char glsl_convolutionDepthwise_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_convolutionDepthwise_comp_len = 3868;
+unsigned int glsl_convolutionDepthwise_comp_len = 3724;
 
 const unsigned char glsl_convolutionDepthwise_RELU_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -5909,31 +5814,19 @@ const unsigned char glsl_convolutionDepthwise_RELU_comp[] = {
   0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x6d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x9e, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xcb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xcb, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xcb, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcb, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcb, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcc, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xde, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xde, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -6187,7 +6080,7 @@ const unsigned char glsl_convolutionDepthwise_RELU_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_convolutionDepthwise_RELU_comp_len = 3940;
+unsigned int glsl_convolutionDepthwise_RELU_comp_len = 3796;
 
 const unsigned char glsl_convolutionDepthwise_RELU6_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -6242,31 +6135,19 @@ const unsigned char glsl_convolutionDepthwise_RELU6_comp[] = {
   0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x6d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x9e, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xcd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xcd, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xcd, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcd, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcd, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xce, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe0, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe0, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -6524,7 +6405,7 @@ const unsigned char glsl_convolutionDepthwise_RELU6_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_convolutionDepthwise_RELU6_comp_len = 3988;
+unsigned int glsl_convolutionDepthwise_RELU6_comp_len = 3844;
 
 const unsigned char glsl_gridSampleBilinear_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -6558,8 +6439,7 @@ const unsigned char glsl_gridSampleBilinear_comp[] = {
   0x70, 0x6c, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x00, 0x00, 0x00, 0x00,
   0x05, 0x00, 0x04, 0x00, 0x83, 0x00, 0x00, 0x00, 0x75, 0x47, 0x72, 0x69,
   0x64, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x46, 0x01, 0x00, 0x00,
-  0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x22, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3b, 0x00, 0x00, 0x00,
@@ -6574,28 +6454,14 @@ const unsigned char glsl_gridSampleBilinear_comp[] = {
   0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x42, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x42, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x83, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x83, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xa2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x46, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x46, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x46, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x46, 0x01, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x46, 0x01, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x47, 0x01, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4c, 0x01, 0x00, 0x00,
-  0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x58, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x65, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x7f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x99, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xb3, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4c, 0x01, 0x00, 0x00,
+  0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
   0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00,
@@ -6849,7 +6715,7 @@ const unsigned char glsl_gridSampleBilinear_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x61, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x61, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_gridSampleBilinear_comp_len = 3852;
+unsigned int glsl_gridSampleBilinear_comp_len = 3672;
 
 const unsigned char glsl_gridSampleBilinear_PAD_MODE_ZEROS_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -6883,8 +6749,7 @@ const unsigned char glsl_gridSampleBilinear_PAD_MODE_ZEROS_comp[] = {
   0x70, 0x6c, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x00, 0x00, 0x00, 0x00,
   0x05, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x75, 0x47, 0x72, 0x69,
   0x64, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x4f, 0x01, 0x00, 0x00,
-  0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x2c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x2c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x45, 0x00, 0x00, 0x00,
@@ -6899,31 +6764,14 @@ const unsigned char glsl_gridSampleBilinear_PAD_MODE_ZEROS_comp[] = {
   0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4c, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x4c, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xac, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x4f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x4f, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x4f, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x4f, 0x01, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x4f, 0x01, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x50, 0x01, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x55, 0x01, 0x00, 0x00,
-  0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x68, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x75, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x8a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x97, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xac, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xb9, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xce, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xdb, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x55, 0x01, 0x00, 0x00,
+  0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
   0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00,
@@ -7248,7 +7096,7 @@ const unsigned char glsl_gridSampleBilinear_PAD_MODE_ZEROS_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x6b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x6b, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_gridSampleBilinear_PAD_MODE_ZEROS_comp_len = 4740;
+unsigned int glsl_gridSampleBilinear_PAD_MODE_ZEROS_comp_len = 4524;
 
 const unsigned char glsl_gridSampleNearest_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -7294,28 +7142,16 @@ const unsigned char glsl_gridSampleNearest_comp[] = {
   0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x5d, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x5d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb5, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xb6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xc8, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc8, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc8, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc9, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xce, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xce, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -7510,7 +7346,7 @@ const unsigned char glsl_gridSampleNearest_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x37, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_gridSampleNearest_comp_len = 3096;
+unsigned int glsl_gridSampleNearest_comp_len = 2952;
 
 const unsigned char glsl_gridSampleNearest_PAD_MODE_ZEROS_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -7556,28 +7392,16 @@ const unsigned char glsl_gridSampleNearest_PAD_MODE_ZEROS_comp[] = {
   0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x5d, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x5d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc4, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xd4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xd7, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xd7, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xd7, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xd8, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xdd, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xdd, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -7798,7 +7622,7 @@ const unsigned char glsl_gridSampleNearest_PAD_MODE_ZEROS_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x37, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_gridSampleNearest_PAD_MODE_ZEROS_comp_len = 3408;
+unsigned int glsl_gridSampleNearest_PAD_MODE_ZEROS_comp_len = 3264;
 
 const unsigned char glsl_relu6_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -7833,19 +7657,13 @@ const unsigned char glsl_relu6_comp[] = {
   0x47, 0x00, 0x03, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x34, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x35, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x3a, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x3d, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3d, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3d, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x3d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x03, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
@@ -7945,7 +7763,7 @@ const unsigned char glsl_relu6_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x2e, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_relu6_comp_len = 1716;
+unsigned int glsl_relu6_comp_len = 1644;
 
 const unsigned char glsl_binaryImage_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -7991,7 +7809,6 @@ const unsigned char glsl_binaryImage_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x70, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x70, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x75, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -8112,7 +7929,7 @@ const unsigned char glsl_binaryImage_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_binaryImage_comp_len = 1948;
+unsigned int glsl_binaryImage_comp_len = 1936;
 
 const unsigned char glsl_binaryImage_ADD_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -8162,7 +7979,6 @@ const unsigned char glsl_binaryImage_ADD_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x73, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x73, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -8310,7 +8126,7 @@ const unsigned char glsl_binaryImage_ADD_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_binaryImage_ADD_comp_len = 2320;
+unsigned int glsl_binaryImage_ADD_comp_len = 2308;
 
 const unsigned char glsl_binaryImage_SUB_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -8360,7 +8176,6 @@ const unsigned char glsl_binaryImage_SUB_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x73, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x73, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -8508,7 +8323,7 @@ const unsigned char glsl_binaryImage_SUB_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_binaryImage_SUB_comp_len = 2320;
+unsigned int glsl_binaryImage_SUB_comp_len = 2308;
 
 const unsigned char glsl_binaryImage_MUL_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -8558,7 +8373,6 @@ const unsigned char glsl_binaryImage_MUL_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x73, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x73, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -8706,7 +8520,7 @@ const unsigned char glsl_binaryImage_MUL_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_binaryImage_MUL_comp_len = 2320;
+unsigned int glsl_binaryImage_MUL_comp_len = 2308;
 
 const unsigned char glsl_binaryImage_DIV_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -8756,7 +8570,6 @@ const unsigned char glsl_binaryImage_DIV_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x7a, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x7a, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x7a, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x7a, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x7f, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -8915,7 +8728,7 @@ const unsigned char glsl_binaryImage_DIV_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_binaryImage_DIV_comp_len = 2460;
+unsigned int glsl_binaryImage_DIV_comp_len = 2448;
 
 const unsigned char glsl_binaryImage_POW_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -8965,7 +8778,6 @@ const unsigned char glsl_binaryImage_POW_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x73, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x73, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -9113,7 +8925,7 @@ const unsigned char glsl_binaryImage_POW_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_binaryImage_POW_comp_len = 2328;
+unsigned int glsl_binaryImage_POW_comp_len = 2316;
 
 const unsigned char glsl_binaryImage_VMAX_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -9163,7 +8975,6 @@ const unsigned char glsl_binaryImage_VMAX_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x73, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x73, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -9311,7 +9122,7 @@ const unsigned char glsl_binaryImage_VMAX_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_binaryImage_VMAX_comp_len = 2328;
+unsigned int glsl_binaryImage_VMAX_comp_len = 2316;
 
 const unsigned char glsl_binaryImage_SQUDIFF_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -9361,7 +9172,6 @@ const unsigned char glsl_binaryImage_SQUDIFF_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x77, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x77, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x7c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -9510,7 +9320,7 @@ const unsigned char glsl_binaryImage_SQUDIFF_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_binaryImage_SQUDIFF_comp_len = 2340;
+unsigned int glsl_binaryImage_SQUDIFF_comp_len = 2328;
 
 const unsigned char glsl_binaryImage_VMIN_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -9560,7 +9370,6 @@ const unsigned char glsl_binaryImage_VMIN_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x73, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x73, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -9708,7 +9517,7 @@ const unsigned char glsl_binaryImage_VMIN_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_binaryImage_VMIN_comp_len = 2328;
+unsigned int glsl_binaryImage_VMIN_comp_len = 2316;
 
 const unsigned char glsl_matmul_input_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -9749,17 +9558,11 @@ const unsigned char glsl_matmul_input_comp[] = {
   0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x8a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x8a, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8a, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8a, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8b, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x95, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xa7, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb1, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb1, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -9941,7 +9744,7 @@ const unsigned char glsl_matmul_input_comp[] = {
   0x22, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00,
   0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_matmul_input_comp_len = 2744;
+unsigned int glsl_matmul_input_comp_len = 2672;
 
 const unsigned char glsl_matmul_input_TRANSPOSE_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -9982,17 +9785,11 @@ const unsigned char glsl_matmul_input_TRANSPOSE_comp[] = {
   0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xab, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xab, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xab, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xab, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xab, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xac, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xb6, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xbf, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc8, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xd2, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xd2, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -10210,7 +10007,7 @@ const unsigned char glsl_matmul_input_TRANSPOSE_comp[] = {
   0x22, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00,
   0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_matmul_input_TRANSPOSE_comp_len = 3176;
+unsigned int glsl_matmul_input_TRANSPOSE_comp_len = 3104;
 
 const unsigned char glsl_nchwToimage_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -10271,13 +10068,10 @@ const unsigned char glsl_nchwToimage_comp[] = {
   0x70, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x72, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xae, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xae, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xae, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
@@ -10478,7 +10272,7 @@ const unsigned char glsl_nchwToimage_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_nchwToimage_comp_len = 3168;
+unsigned int glsl_nchwToimage_comp_len = 3132;
 
 const unsigned char glsl_packAsImage4x4_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -10529,7 +10323,6 @@ const unsigned char glsl_packAsImage4x4_comp[] = {
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x91, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x91, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x91, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x03, 0x00, 0x91, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xe5, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
@@ -10740,7 +10533,7 @@ const unsigned char glsl_packAsImage4x4_comp[] = {
   0x37, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00,
   0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_packAsImage4x4_comp_len = 3092;
+unsigned int glsl_packAsImage4x4_comp_len = 3080;
 
 const unsigned char glsl_packAsImage4x4_TRANSPOSE_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -10791,7 +10584,6 @@ const unsigned char glsl_packAsImage4x4_TRANSPOSE_comp[] = {
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x91, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x91, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x91, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x03, 0x00, 0x91, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x06, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
@@ -11038,7 +10830,7 @@ const unsigned char glsl_packAsImage4x4_TRANSPOSE_comp[] = {
   0x37, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00,
   0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_packAsImage4x4_TRANSPOSE_comp_len = 3524;
+unsigned int glsl_packAsImage4x4_TRANSPOSE_comp_len = 3512;
 
 const unsigned char glsl_roipooling_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -11089,7 +10881,6 @@ const unsigned char glsl_roipooling_comp[] = {
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0b, 0x01, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x0b, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x0b, 0x01, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x03, 0x00, 0x0b, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x1e, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
@@ -11387,7 +11178,7 @@ const unsigned char glsl_roipooling_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x2d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x2d, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_roipooling_comp_len = 4140;
+unsigned int glsl_roipooling_comp_len = 4128;
 
 const unsigned char glsl_blit_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -11629,7 +11420,6 @@ const unsigned char glsl_blit_image_comp[] = {
   0xc9, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xc9, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc9, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc9, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
@@ -11832,7 +11622,7 @@ const unsigned char glsl_blit_image_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x21, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_blit_image_comp_len = 3004;
+unsigned int glsl_blit_image_comp_len = 2992;
 
 const unsigned char glsl_fill_image_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -11870,7 +11660,6 @@ const unsigned char glsl_fill_image_comp[] = {
   0x32, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x32, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x32, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3e, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
@@ -11942,7 +11731,7 @@ const unsigned char glsl_fill_image_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x22, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_fill_image_comp_len = 1272;
+unsigned int glsl_fill_image_comp_len = 1260;
 
 const unsigned char glsl_imageTonchw_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -11987,12 +11776,9 @@ const unsigned char glsl_imageTonchw_comp[] = {
   0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x16, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x35, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x35, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x48, 0x00, 0x05, 0x00, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00,
   0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00,
@@ -12198,7 +11984,7 @@ const unsigned char glsl_imageTonchw_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_imageTonchw_comp_len = 3024;
+unsigned int glsl_imageTonchw_comp_len = 2988;
 
 const unsigned char glsl_softmaxHeight_NHWC_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -12460,19 +12246,13 @@ const unsigned char glsl_resizeNearest_comp[] = {
   0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x16, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x86, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x86, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x86, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x86, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x87, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -12629,7 +12409,7 @@ const unsigned char glsl_resizeNearest_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_resizeNearest_comp_len = 2460;
+unsigned int glsl_resizeNearest_comp_len = 2388;
 
 const unsigned char glsl_resizeNearest_NEAREST_ROUND_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -12669,19 +12449,13 @@ const unsigned char glsl_resizeNearest_NEAREST_ROUND_comp[] = {
   0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x16, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x8b, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8b, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8b, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8c, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -12843,7 +12617,7 @@ const unsigned char glsl_resizeNearest_NEAREST_ROUND_comp[] = {
   0x33, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00,
   0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_resizeNearest_NEAREST_ROUND_comp_len = 2516;
+unsigned int glsl_resizeNearest_NEAREST_ROUND_comp_len = 2444;
 
 const unsigned char glsl_reduce_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -13889,22 +13663,13 @@ const unsigned char glsl_resizeBilinear_comp[] = {
   0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x16, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8c, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x9d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xd0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xe7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xe7, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xe7, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xe7, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xe7, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xe8, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x01, 0x01, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x01, 0x01, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -14120,7 +13885,7 @@ const unsigned char glsl_resizeBilinear_comp[] = {
   0x33, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00,
   0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_resizeBilinear_comp_len = 3200;
+unsigned int glsl_resizeBilinear_comp_len = 3092;
 
 const unsigned char glsl_nchwTonc4hw4_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -14879,18 +14644,13 @@ const unsigned char glsl_im2col1x1_comp[] = {
   0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xae, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xae, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xc3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xc3, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xc3, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc3, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc4, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xcd, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xcd, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -15090,7 +14850,7 @@ const unsigned char glsl_im2col1x1_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x66, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_im2col1x1_comp_len = 3112;
+unsigned int glsl_im2col1x1_comp_len = 3052;
 
 const unsigned char glsl_avgpool_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -15142,7 +14902,6 @@ const unsigned char glsl_avgpool_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0xab, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xab, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xab, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0xab, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xc0, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -15360,7 +15119,7 @@ const unsigned char glsl_avgpool_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x47, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_avgpool_comp_len = 3184;
+unsigned int glsl_avgpool_comp_len = 3172;
 
 const unsigned char glsl_unPackImage4x4_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -15399,15 +15158,9 @@ const unsigned char glsl_unPackImage4x4_comp[] = {
   0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x15, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x5f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x6b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x8b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
   0x48, 0x00, 0x04, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x8c, 0x00, 0x00, 0x00,
@@ -15586,7 +15339,7 @@ const unsigned char glsl_unPackImage4x4_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x37, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unPackImage4x4_comp_len = 2664;
+unsigned int glsl_unPackImage4x4_comp_len = 2592;
 
 const unsigned char glsl_unPackImage4x4_TRANSPOSE_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -15625,15 +15378,9 @@ const unsigned char glsl_unPackImage4x4_TRANSPOSE_comp[] = {
   0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x15, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x5f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x6b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xac, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
   0x48, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0xad, 0x00, 0x00, 0x00,
@@ -15848,7 +15595,7 @@ const unsigned char glsl_unPackImage4x4_TRANSPOSE_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x37, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_unPackImage4x4_TRANSPOSE_comp_len = 3096;
+unsigned int glsl_unPackImage4x4_TRANSPOSE_comp_len = 3024;
 
 const unsigned char glsl_maxpool_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -15900,7 +15647,6 @@ const unsigned char glsl_maxpool_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9c, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x9c, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x9c, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xaf, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -16102,7 +15848,7 @@ const unsigned char glsl_maxpool_comp[] = {
   0x47, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x47, 0x00, 0x00, 0x00,
   0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_maxpool_comp_len = 2996;
+unsigned int glsl_maxpool_comp_len = 2984;
 
 const unsigned char glsl_winogradTransformDest2_3_1_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -16175,7 +15921,6 @@ const unsigned char glsl_winogradTransformDest2_3_1_comp[] = {
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x01, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x73, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x73, 0x01, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x03, 0x00, 0x73, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xb1, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
@@ -16588,7 +16333,7 @@ const unsigned char glsl_winogradTransformDest2_3_1_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x53, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_winogradTransformDest2_3_1_comp_len = 5784;
+unsigned int glsl_winogradTransformDest2_3_1_comp_len = 5772;
 
 const unsigned char glsl_winogradTransformDest2_3_1_RELU_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -16661,7 +16406,6 @@ const unsigned char glsl_winogradTransformDest2_3_1_RELU_comp[] = {
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x77, 0x01, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x77, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x77, 0x01, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x03, 0x00, 0x77, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xbb, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
@@ -17087,7 +16831,7 @@ const unsigned char glsl_winogradTransformDest2_3_1_RELU_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x53, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_winogradTransformDest2_3_1_RELU_comp_len = 5940;
+unsigned int glsl_winogradTransformDest2_3_1_RELU_comp_len = 5928;
 
 const unsigned char glsl_winogradTransformDest2_3_1_RELU6_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -17160,7 +16904,6 @@ const unsigned char glsl_winogradTransformDest2_3_1_RELU6_comp[] = {
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x79, 0x01, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x79, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x79, 0x01, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x03, 0x00, 0x79, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xbd, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
@@ -17591,7 +17334,7 @@ const unsigned char glsl_winogradTransformDest2_3_1_RELU6_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x53, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_winogradTransformDest2_3_1_RELU6_comp_len = 6000;
+unsigned int glsl_winogradTransformDest2_3_1_RELU6_comp_len = 5988;
 
 const unsigned char glsl_winogradTransformSource2_3_1_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -17660,7 +17403,6 @@ const unsigned char glsl_winogradTransformSource2_3_1_comp[] = {
   0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x95, 0x01, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x95, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x95, 0x01, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x03, 0x00, 0x95, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x54, 0x02, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
@@ -18327,7 +18069,7 @@ const unsigned char glsl_winogradTransformSource2_3_1_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_winogradTransformSource2_3_1_comp_len = 8776;
+unsigned int glsl_winogradTransformSource2_3_1_comp_len = 8764;
 
 const unsigned char glsl_col2Im_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -19104,13 +18846,10 @@ const unsigned char glsl_nc4hw4toimage_comp[] = {
   0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3b, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x3b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x6f, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x6f, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x6f, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x73, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
   0x48, 0x00, 0x04, 0x00, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x74, 0x00, 0x00, 0x00,
@@ -19242,7 +18981,7 @@ const unsigned char glsl_nc4hw4toimage_comp[] = {
   0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00,
   0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_nc4hw4toimage_comp_len = 2240;
+unsigned int glsl_nc4hw4toimage_comp_len = 2204;
 
 const unsigned char glsl_imageTonc4hw4_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -19287,12 +19026,9 @@ const unsigned char glsl_imageTonc4hw4_comp[] = {
   0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x16, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x35, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x35, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x48, 0x00, 0x05, 0x00, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00,
   0x45, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00,
@@ -19434,7 +19170,7 @@ const unsigned char glsl_imageTonc4hw4_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_imageTonc4hw4_comp_len = 2256;
+unsigned int glsl_imageTonc4hw4_comp_len = 2220;
 
 const unsigned char glsl_matmul_output_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -19475,17 +19211,11 @@ const unsigned char glsl_matmul_output_comp[] = {
   0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x81, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x81, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x81, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x81, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x82, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x90, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9d, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xaa, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -19658,7 +19388,7 @@ const unsigned char glsl_matmul_output_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_matmul_output_comp_len = 2632;
+unsigned int glsl_matmul_output_comp_len = 2560;
 
 const unsigned char glsl_matmul_output_BIAS_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -19703,17 +19433,11 @@ const unsigned char glsl_matmul_output_BIAS_comp[] = {
   0x36, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x95, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x95, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x95, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x95, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x96, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xa4, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xb1, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xbe, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc7, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc7, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -19900,7 +19624,7 @@ const unsigned char glsl_matmul_output_BIAS_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x22, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_matmul_output_BIAS_comp_len = 2856;
+unsigned int glsl_matmul_output_BIAS_comp_len = 2784;
 
 const unsigned char glsl_matmul_output_TRANSPOSE_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -19941,17 +19665,11 @@ const unsigned char glsl_matmul_output_TRANSPOSE_comp[] = {
   0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xa3, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xa3, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xa3, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xa3, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xa4, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xb1, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xbe, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcb, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xd1, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xd1, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -20160,7 +19878,7 @@ const unsigned char glsl_matmul_output_TRANSPOSE_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_matmul_output_TRANSPOSE_comp_len = 3064;
+unsigned int glsl_matmul_output_TRANSPOSE_comp_len = 2992;
 
 const unsigned char glsl_matmul_output_TRANSPOSE_BIAS_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -20205,17 +19923,11 @@ const unsigned char glsl_matmul_output_TRANSPOSE_BIAS_comp[] = {
   0x36, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xb7, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xb7, 0x00, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xb7, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xb8, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc5, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xd2, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xdf, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe8, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe8, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -20438,7 +20150,7 @@ const unsigned char glsl_matmul_output_TRANSPOSE_BIAS_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x22, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_matmul_output_TRANSPOSE_BIAS_comp_len = 3288;
+unsigned int glsl_matmul_output_TRANSPOSE_BIAS_comp_len = 3216;
 
 const unsigned char glsl_gemm16x16_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -20471,34 +20183,16 @@ const unsigned char glsl_gemm16x16_comp[] = {
   0x47, 0x00, 0x03, 0x00, 0x16, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x56, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x57, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x60, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6f, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x7e, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8d, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x94, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x94, 0x00, 0x00, 0x00,
+  0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x94, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x94, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x31, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x31, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x31, 0x01, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x31, 0x01, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x31, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x32, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x3a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x42, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x4a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x31, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x53, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
   0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00,
@@ -20777,7 +20471,7 @@ const unsigned char glsl_gemm16x16_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x28, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_gemm16x16_comp_len = 4012;
+unsigned int glsl_gemm16x16_comp_len = 3796;
 
 const unsigned char glsl_gemm16x16_FP16_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -20814,34 +20508,16 @@ const unsigned char glsl_gemm16x16_FP16_comp[] = {
   0x16, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x18, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x57, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x57, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x58, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x86, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x86, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x87, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xa2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xac, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x13, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x13, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x13, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x13, 0x01, 0x00, 0x00,
-  0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x13, 0x01, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x14, 0x01, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x1d, 0x01, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x26, 0x01, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x2f, 0x01, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x39, 0x01, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x39, 0x01, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -21138,7 +20814,7 @@ const unsigned char glsl_gemm16x16_FP16_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x28, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_gemm16x16_FP16_comp_len = 4276;
+unsigned int glsl_gemm16x16_FP16_comp_len = 4060;
 
 const unsigned char glsl_deconvolutionDepthwise_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -21193,30 +20869,19 @@ const unsigned char glsl_deconvolutionDepthwise_comp[] = {
   0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x53, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x53, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x58, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xad, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xad, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xbf, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xca, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xd6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xd6, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xd6, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xd6, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xd7, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe9, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe9, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -21494,7 +21159,7 @@ const unsigned char glsl_deconvolutionDepthwise_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x32, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_deconvolutionDepthwise_comp_len = 4216;
+unsigned int glsl_deconvolutionDepthwise_comp_len = 4084;
 
 const unsigned char glsl_deconvolutionDepthwise_RELU_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -21549,30 +21214,19 @@ const unsigned char glsl_deconvolutionDepthwise_RELU_comp[] = {
   0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x53, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x53, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x58, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xad, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xad, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xbf, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xca, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xda, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xda, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xda, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xdb, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xed, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xed, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -21856,7 +21510,7 @@ const unsigned char glsl_deconvolutionDepthwise_RELU_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x32, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_deconvolutionDepthwise_RELU_comp_len = 4288;
+unsigned int glsl_deconvolutionDepthwise_RELU_comp_len = 4156;
 
 const unsigned char glsl_deconvolutionDepthwise_RELU6_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -21911,30 +21565,19 @@ const unsigned char glsl_deconvolutionDepthwise_RELU6_comp[] = {
   0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x53, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x53, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x58, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xad, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xad, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xbf, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xca, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0xdc, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xdc, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xdc, 0x00, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xef, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xef, 0x00, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -22222,7 +21865,7 @@ const unsigned char glsl_deconvolutionDepthwise_RELU6_comp[] = {
   0xf8, 0x00, 0x02, 0x00, 0x32, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00,
   0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_deconvolutionDepthwise_RELU6_comp_len = 4336;
+unsigned int glsl_deconvolutionDepthwise_RELU6_comp_len = 4204;
 
 const unsigned char glsl_preluWithChannel_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -22259,24 +21902,15 @@ const unsigned char glsl_preluWithChannel_comp[] = {
   0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x15, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x5b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x5b, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x5c, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x5f, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x72, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x7a, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
@@ -22400,7 +22034,7 @@ const unsigned char glsl_preluWithChannel_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x3a, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x3a, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_preluWithChannel_comp_len = 2088;
+unsigned int glsl_preluWithChannel_comp_len = 1980;
 
 const unsigned char glsl_deconvIm2Col_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -22454,24 +22088,16 @@ const unsigned char glsl_deconvIm2Col_comp[] = {
   0x17, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x70, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x70, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xee, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xee, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xee, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x00, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x00, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x00, 0x01, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x01, 0x01, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x13, 0x01, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x13, 0x01, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -22760,7 +22386,7 @@ const unsigned char glsl_deconvIm2Col_comp[] = {
   0x5f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x5f, 0x00, 0x00, 0x00,
   0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_deconvIm2Col_comp_len = 4268;
+unsigned int glsl_deconvIm2Col_comp_len = 4172;
 
 const unsigned char glsl_deconvIm2Col_RELU_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -22814,24 +22440,16 @@ const unsigned char glsl_deconvIm2Col_RELU_comp[] = {
   0x17, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x70, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x70, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xee, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xee, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xee, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x04, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x04, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x04, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x04, 0x01, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x05, 0x01, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x17, 0x01, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x17, 0x01, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -23126,7 +22744,7 @@ const unsigned char glsl_deconvIm2Col_RELU_comp[] = {
   0x5f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x5f, 0x00, 0x00, 0x00,
   0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_deconvIm2Col_RELU_comp_len = 4340;
+unsigned int glsl_deconvIm2Col_RELU_comp_len = 4244;
 
 const unsigned char glsl_deconvIm2Col_RELU6_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -23180,24 +22798,16 @@ const unsigned char glsl_deconvIm2Col_RELU6_comp[] = {
   0x17, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x19, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
-  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x70, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00,
   0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x70, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x47, 0x00, 0x03, 0x00, 0xee, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0xee, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xee, 0x00, 0x00, 0x00,
-  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x06, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x06, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   0x47, 0x00, 0x04, 0x00, 0x06, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x06, 0x01, 0x00, 0x00,
-  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x07, 0x01, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x01, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x01, 0x00, 0x00,
   0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00,
   0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
@@ -23496,7 +23106,7 @@ const unsigned char glsl_deconvIm2Col_RELU6_comp[] = {
   0x5f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x5f, 0x00, 0x00, 0x00,
   0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_deconvIm2Col_RELU6_comp_len = 4388;
+unsigned int glsl_deconvIm2Col_RELU6_comp_len = 4292;
 
 const unsigned char glsl_buffer2Image1D_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
@@ -23667,7 +23277,6 @@ const unsigned char glsl_scale_comp[] = {
   0x47, 0x00, 0x04, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6c, 0x00, 0x00, 0x00,
   0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
-  0x6c, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00,
   0x6c, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
   0x71, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00,
   0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00,
@@ -23791,7 +23400,365 @@ const unsigned char glsl_scale_comp[] = {
   0xf9, 0x00, 0x02, 0x00, 0x24, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
   0x24, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
 };
-unsigned int glsl_scale_comp_len = 2196;
+unsigned int glsl_scale_comp_len = 2184;
+
+const unsigned char glsl_argmax_comp[] = {
+  0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
+  0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30,
+  0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00,
+  0x0d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00,
+  0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00,
+  0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00,
+  0x0d, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61,
+  0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49,
+  0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00,
+  0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00,
+  0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+  0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00,
+  0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00,
+  0x49, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x75,
+  0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00,
+  0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61,
+  0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00,
+  0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00,
+  0x6e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66,
+  0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x6e, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00,
+  0x05, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74,
+  0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00,
+  0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+  0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00,
+  0x0c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00,
+  0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00,
+  0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x1a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x49, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00,
+  0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x49, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00,
+  0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x4b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x6e, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00,
+  0x6e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00,
+  0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x70, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x47, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00,
+  0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x17, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00,
+  0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x16, 0x00, 0x03, 0x00, 0x17, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+  0x1e, 0x00, 0x06, 0x00, 0x18, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
+  0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00,
+  0x1a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x20, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00,
+  0x26, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00,
+  0x28, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x1d, 0x00, 0x03, 0x00, 0x48, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
+  0x1e, 0x00, 0x03, 0x00, 0x49, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00,
+  0x20, 0x00, 0x04, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x49, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x4a, 0x00, 0x00, 0x00,
+  0x4b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00,
+  0x4d, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
+  0x1d, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x1e, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00,
+  0x20, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x6e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00,
+  0x70, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00,
+  0x0a, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+  0x2c, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00,
+  0x75, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00,
+  0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
+  0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00,
+  0x0e, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00,
+  0x07, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00,
+  0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00,
+  0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00,
+  0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00,
+  0x1b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x1e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00,
+  0x1e, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x25, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00,
+  0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00,
+  0x25, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00,
+  0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00,
+  0x2d, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00,
+  0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00,
+  0x1c, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00,
+  0x32, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00,
+  0x28, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00,
+  0x34, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00,
+  0xf8, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00,
+  0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00,
+  0x05, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00,
+  0xf7, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0xfa, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00,
+  0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00,
+  0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00,
+  0x1f, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00,
+  0x1c, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00,
+  0x3f, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x41, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00,
+  0x41, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x45, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00,
+  0x41, 0x00, 0x06, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00,
+  0x4b, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00,
+  0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00,
+  0x4e, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00,
+  0xf8, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00,
+  0x17, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00,
+  0x37, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00,
+  0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00,
+  0x1b, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00,
+  0x55, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x83, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00,
+  0x6c, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00,
+  0x28, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00,
+  0x41, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00,
+  0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00,
+  0x5a, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00,
+  0xf8, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00,
+  0x1e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x61, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00,
+  0x41, 0x00, 0x06, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00,
+  0x4b, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00,
+  0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00,
+  0x62, 0x00, 0x00, 0x00, 0xba, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00,
+  0x66, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00,
+  0xf9, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
+  0x68, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00,
+  0x89, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00,
+  0x85, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x8a, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00,
+  0x84, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x55, 0x00, 0x00, 0x00,
+  0xf8, 0x00, 0x02, 0x00, 0x55, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00,
+  0x3f, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00,
+  0xf8, 0x00, 0x02, 0x00, 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00,
+  0x1c, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00,
+  0x1b, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00,
+  0x74, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00,
+  0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00,
+  0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
+};
+unsigned int glsl_argmax_comp_len = 2096;
+
+const unsigned char glsl_argmax_ARGMIN_comp[] = {
+  0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
+  0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30,
+  0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00,
+  0x0d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00,
+  0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00,
+  0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00,
+  0x0d, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61,
+  0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49,
+  0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00,
+  0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00,
+  0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+  0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00,
+  0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00,
+  0x49, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x75,
+  0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00,
+  0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61,
+  0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00,
+  0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00,
+  0x6e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66,
+  0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x6e, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00,
+  0x05, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74,
+  0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00,
+  0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+  0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00,
+  0x0c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00,
+  0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00,
+  0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x1a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x49, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00,
+  0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x49, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00,
+  0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x4b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x6e, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00,
+  0x6e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00,
+  0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00,
+  0x70, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x47, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+  0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00,
+  0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x17, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00,
+  0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00,
+  0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x16, 0x00, 0x03, 0x00, 0x17, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+  0x1e, 0x00, 0x06, 0x00, 0x18, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
+  0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00,
+  0x1a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x20, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00,
+  0x26, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00,
+  0x28, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x1d, 0x00, 0x03, 0x00, 0x48, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
+  0x1e, 0x00, 0x03, 0x00, 0x49, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00,
+  0x20, 0x00, 0x04, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x49, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x4a, 0x00, 0x00, 0x00,
+  0x4b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00,
+  0x4d, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00,
+  0x1d, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x1e, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00,
+  0x20, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x6e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00,
+  0x70, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00,
+  0x0a, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+  0x2c, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00,
+  0x75, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00,
+  0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
+  0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00,
+  0x0e, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00,
+  0x07, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00,
+  0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00,
+  0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00,
+  0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00,
+  0x1b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x1e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00,
+  0x1e, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x25, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00,
+  0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00,
+  0x25, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00,
+  0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00,
+  0x2d, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00,
+  0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00,
+  0x1c, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00,
+  0x32, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00,
+  0x28, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00,
+  0x34, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00,
+  0xf8, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00,
+  0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00,
+  0x05, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00,
+  0xf7, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0xfa, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00,
+  0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00,
+  0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00,
+  0x1f, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00,
+  0x1c, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00,
+  0x3f, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x41, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00,
+  0x41, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x45, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00,
+  0x41, 0x00, 0x06, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00,
+  0x4b, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00,
+  0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00,
+  0x4e, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00,
+  0xf8, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00,
+  0x17, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00,
+  0x37, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00,
+  0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00,
+  0x1b, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00,
+  0x55, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x83, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00,
+  0x6c, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00,
+  0x28, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00,
+  0x41, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00,
+  0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00,
+  0x5a, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00,
+  0xf8, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00,
+  0x1e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x61, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00,
+  0x41, 0x00, 0x06, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00,
+  0x4b, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00,
+  0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00,
+  0x62, 0x00, 0x00, 0x00, 0xb8, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00,
+  0x66, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00,
+  0xf9, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00,
+  0x68, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00,
+  0x89, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00,
+  0x85, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x06, 0x00, 0x00, 0x00,
+  0x8a, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00,
+  0x84, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x55, 0x00, 0x00, 0x00,
+  0xf8, 0x00, 0x02, 0x00, 0x55, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00,
+  0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00,
+  0x3f, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00,
+  0xf8, 0x00, 0x02, 0x00, 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00,
+  0x1c, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00,
+  0x1b, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00,
+  0x74, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00,
+  0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00,
+  0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00
+};
+unsigned int glsl_argmax_ARGMIN_comp_len = 2096;
 
 const unsigned char glsl_buffer2Image3D_comp[] = {
   0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00,
diff --git a/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp b/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp
index 26e804ec4..5575d39eb 100644
--- a/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp
+++ b/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp
@@ -109,6 +109,8 @@ mMaps.insert(std::make_pair("glsl_deconvIm2Col_RELU_comp", std::make_pair(glsl_d
 mMaps.insert(std::make_pair("glsl_deconvIm2Col_RELU6_comp", std::make_pair(glsl_deconvIm2Col_RELU6_comp,glsl_deconvIm2Col_RELU6_comp_len)));
 mMaps.insert(std::make_pair("glsl_buffer2Image1D_comp", std::make_pair(glsl_buffer2Image1D_comp,glsl_buffer2Image1D_comp_len)));
 mMaps.insert(std::make_pair("glsl_scale_comp", std::make_pair(glsl_scale_comp,glsl_scale_comp_len)));
+mMaps.insert(std::make_pair("glsl_argmax_comp", std::make_pair(glsl_argmax_comp,glsl_argmax_comp_len)));
+mMaps.insert(std::make_pair("glsl_argmax_ARGMIN_comp", std::make_pair(glsl_argmax_ARGMIN_comp,glsl_argmax_ARGMIN_comp_len)));
 mMaps.insert(std::make_pair("glsl_buffer2Image3D_comp", std::make_pair(glsl_buffer2Image3D_comp,glsl_buffer2Image3D_comp_len)));
 }
 }
diff --git a/source/backend/vulkan/image/compiler/makeshader.py b/source/backend/vulkan/image/compiler/makeshader.py
index 42a904a86..f94765441 100755
--- a/source/backend/vulkan/image/compiler/makeshader.py
+++ b/source/backend/vulkan/image/compiler/makeshader.py
@@ -405,7 +405,8 @@ def genCppFile(objs, inc, dst):
             if len(spirv_save) > 0:
                 out = spirv_save
                 rm = False
-            print(os.popen("glslangValidator -V " + s + " -Os -o " + out).read())
+            cmd = "glslangValidator -V " + s + " -Os -o " + out
+            print(os.popen(cmd).read())
         else:
             out = spirv_cache
             rm = False
diff --git a/source/backend/vulkan/image/execution/VulkanArgMax.cpp b/source/backend/vulkan/image/execution/VulkanArgMax.cpp
new file mode 100644
index 000000000..cc97cdb74
--- /dev/null
+++ b/source/backend/vulkan/image/execution/VulkanArgMax.cpp
@@ -0,0 +1,129 @@
+//
+//  VulkanArgMax.cpp
+//  MNN
+//
+//  Created by MNN on 2024/08/20.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#include "VulkanArgMax.hpp"
+
+namespace MNN {
+
+struct GpuArgMaxParam {
+    ivec4 size; // inside, mid, outside, 0
+};
+
+VulkanArgMax::VulkanArgMax(const Op* op, Backend* bn) : VulkanBasicExecution(bn) {
+    auto vkBn = (VulkanBackend *)backend();
+
+    mAxis = op->main_as_ArgMax()->axis();
+
+    std::vector<VkDescriptorType> types{
+        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+        VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+    };
+    if (op->type() == OpType_ArgMax) {
+        mArgMaxPipeline =
+            vkBn->getPipeline("glsl_argmax_comp", types);
+    } else {
+        MNN_ASSERT(op->type() == OpType_ArgMin);
+        mArgMaxPipeline =
+            vkBn->getPipeline("glsl_argmax_ARGMIN_comp", types);
+    }
+
+    mGpuArgMaxParam.reset(new VulkanBuffer(vkBn->getMemoryPool(), false, sizeof(GpuArgMaxParam), nullptr, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT));
+    mDescriptorSet.reset(mArgMaxPipeline->createSet());
+}
+
+VulkanArgMax::~VulkanArgMax() {
+}
+
+// set descriptorSet， including output, input and GPU param
+ErrorCode VulkanArgMax::onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
+                       const VulkanCommandPool::Buffer* cmdBuffer) {
+    auto vkBn = (VulkanBackend*)backend();
+    auto input  = inputs[0];
+    auto output = outputs[0];
+
+    // set GPU param
+    auto axis = mAxis;
+    if (axis < 0) {
+        axis = input->dimensions() + axis;
+    }
+    int inside = 1;
+    int outside = 1;
+    int mid = input->length(axis);
+    for (int i=0; i<axis; ++i) {
+        outside *= input->length(i);
+    }
+    for (int i=axis+1; i<input->dimensions(); ++i) {
+        inside *= input->length(i);
+    }
+    auto total = outside * inside;
+
+    auto Argmax = reinterpret_cast<GpuArgMaxParam *>(mGpuArgMaxParam->map());
+    Argmax->size[0] = inside;
+    Argmax->size[1] = mid;
+    Argmax->size[2] = outside;
+    Argmax->size[3] = 0;
+    mGpuArgMaxParam->unmap();
+
+    // set necessary storages, set descriptorSet and bind commandBuffer
+    {
+        int bufferSizeSource = sizeof(float);
+        for (int i=0; i<input->dimensions(); ++i) {
+            bufferSizeSource *= input->length(i);
+        }
+        mSource.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSizeSource, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT));
+        mSource.convert.reset(new VulkanImageConverter(vkBn));
+    }
+    {
+        int bufferSizeOutput = sizeof(float);
+        for (int i=0; i<output->dimensions(); ++i) {
+            bufferSizeOutput *= output->length(i);
+        }
+        mOutput.convert.reset(new VulkanImageConverter(vkBn));
+        mOutput.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSizeOutput, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT));
+    }
+
+    mSource.convert->encodeTensorToBuffer(input, mSource.buffer->buffer(), mSource.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(input), cmdBuffer);
+
+    mDescriptorSet->writeBuffer(mOutput.buffer->buffer(), 0, mOutput.buffer->size());
+    mDescriptorSet->writeBuffer(mSource.buffer->buffer(), 1, mSource.buffer->size());
+    mDescriptorSet->writeBuffer(mGpuArgMaxParam->buffer(), 2, mGpuArgMaxParam->size());
+
+    cmdBuffer->barrierSource(mSource.buffer->buffer(), 0, mSource.buffer->size());
+
+    mArgMaxPipeline->bind(cmdBuffer->get(), mDescriptorSet->get());
+    vkCmdDispatch(cmdBuffer->get(), UP_DIV(total, 256), 1, 1);
+
+    cmdBuffer->barrierSource(mOutput.buffer->buffer(), 0, mOutput.buffer->size());
+    mOutput.convert->encodeBufferToTensor(mOutput.buffer->buffer(), output, mOutput.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(output), cmdBuffer);
+    {
+        mSource.buffer->release();
+        mOutput.buffer->release();
+    }
+    return NO_ERROR;
+}
+
+class VulkanArgMaxCreator : public VulkanBackend::Creator {
+public:
+    virtual VulkanBasicExecution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op,
+                                Backend* backend) const override {
+        if (TensorUtils::getDescribe(inputs[0])->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
+            // Don't support legency version
+            return nullptr;
+        }
+        return new VulkanArgMax(op, backend);
+    }
+};
+
+static bool gResistor = []() {
+    VulkanBackend::addCreator(OpType_ArgMax, new VulkanArgMaxCreator);
+    VulkanBackend::addCreator(OpType_ArgMin, new VulkanArgMaxCreator);
+    return true;
+}();
+
+}
diff --git a/source/backend/vulkan/image/execution/VulkanArgMax.hpp b/source/backend/vulkan/image/execution/VulkanArgMax.hpp
new file mode 100644
index 000000000..31d39c795
--- /dev/null
+++ b/source/backend/vulkan/image/execution/VulkanArgMax.hpp
@@ -0,0 +1,40 @@
+//
+//  VulkanArgMax.cpp
+//  MNN
+//
+//  Created by MNN on 2024/08/20.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifndef VulkanArgMax_hpp
+#define VulkanArgMax_hpp
+
+#include "VulkanBasicExecution.hpp"
+#include "VulkanImageConverter.hpp"
+
+namespace MNN {
+class VulkanArgMax : public VulkanBasicExecution {
+
+public:
+    VulkanArgMax(const Op* op, Backend* bn);
+    virtual ~VulkanArgMax();
+    ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
+                       const VulkanCommandPool::Buffer* cmdBuffer) override;
+
+private:
+    const VulkanPipeline* mArgMaxPipeline;
+    std::shared_ptr<VulkanLayout::DescriptorSet> mDescriptorSet;
+    std::shared_ptr<VulkanBuffer> mGpuArgMaxParam;
+    struct ConvertInfo {
+        const VulkanPipeline* pipeline;
+        std::shared_ptr<VulkanImageConverter> convert;
+        std::shared_ptr<VulkanBuffer> buffer;
+    };
+    ConvertInfo mSource;
+    ConvertInfo mOutput;
+    int mAxis;
+};
+
+} // namespace MNN
+
+#endif /* VulkanArgMax_hpp */
diff --git a/source/backend/vulkan/image/execution/VulkanBinary.cpp b/source/backend/vulkan/image/execution/VulkanBinary.cpp
index ece019c90..2cc3e9037 100644
--- a/source/backend/vulkan/image/execution/VulkanBinary.cpp
+++ b/source/backend/vulkan/image/execution/VulkanBinary.cpp
@@ -173,6 +173,9 @@ class VulkanBinaryCreator : public VulkanBackend::Creator {
     virtual VulkanBasicExecution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op,
                                 Backend* backend) const override {
         auto input0 = inputs[0];
+        if (input0->getType().code != halide_type_float) {
+            return nullptr;
+        }
         auto image = TensorUtils::getDescribe(input0)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4;
         auto shader = _getShaderName(op, image);
         if (shader.empty()) {
diff --git a/source/backend/vulkan/image/execution/VulkanConvolution.cpp b/source/backend/vulkan/image/execution/VulkanConvolution.cpp
index 2730b504b..b0e3d7ab1 100644
--- a/source/backend/vulkan/image/execution/VulkanConvolution.cpp
+++ b/source/backend/vulkan/image/execution/VulkanConvolution.cpp
@@ -255,7 +255,7 @@ class VulkanConvolutionCreator : public VulkanBackend::Creator {
                     return nullptr;
                 }
             }
-            quanWeight = ConvolutionCommon::load(op->main_as_Convolution2D(), backend, true);
+            quanWeight = ConvolutionCommon::load(op, backend, true);
             srcCount = quanWeight->weightFloat.size() / (outputCount * fh * fw);
             source   = quanWeight->weightFloat.get();
             weightSize = quanWeight->weightFloat.size();
diff --git a/source/backend/vulkan/image/execution/VulkanDeconvolution.cpp b/source/backend/vulkan/image/execution/VulkanDeconvolution.cpp
index 22b906356..ae9dab1d5 100644
--- a/source/backend/vulkan/image/execution/VulkanDeconvolution.cpp
+++ b/source/backend/vulkan/image/execution/VulkanDeconvolution.cpp
@@ -20,7 +20,8 @@ static void writeReorderBuffer(VulkanMatMul::Reorder::nchwBuffer& buffer, int co
     buffer.stride[3] = 1;
 }
 
-VulkanDeconvolution::VulkanDeconvolution(Backend* bn, const std::vector<Tensor*>& inputs, const Convolution2D* conv) : VulkanBasicExecution(bn) {
+VulkanDeconvolution::VulkanDeconvolution(Backend* bn, const std::vector<Tensor*>& inputs, const Op* op) : VulkanBasicExecution(bn) {
+    auto conv = op->main_as_Convolution2D();
     mConvCommonOption = conv->common();
     auto vkBn         = (VulkanBackend*)bn;
     mConvParam = std::make_shared<VulkanBuffer>(vkBn->getMemoryPool(), false,
@@ -34,7 +35,7 @@ VulkanDeconvolution::VulkanDeconvolution(Backend* bn, const std::vector<Tensor*>
     const float* filterDataPtr = nullptr;
     int tempWeightSize   = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &tempWeightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &filterDataPtr, &tempWeightSize);
 
     if (nullptr != filterDataPtr) {
         MNN_ASSERT(inputs.size() == 1);
@@ -142,7 +143,7 @@ ErrorCode VulkanDeconvolution::onEncode(const std::vector<Tensor*>& inputs, cons
 
         dstImage->barrierWrite(cmdBuffer->get());
         (reinterpret_cast<VulkanTensor*>(src->deviceId()))->image()->barrierRead(cmdBuffer->get());
-        
+
         vkCmdDispatch(cmdBuffer->get(), UP_DIV(totalInputSize, VulkanConvolutionCommon::gImage2ColLocal), 1, 1);
     }
 
@@ -176,7 +177,7 @@ class VulkanDeconvolutionCreator : public VulkanBackend::Creator {
 public:
     virtual VulkanBasicExecution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op,
                                 Backend* backend) const override {
-        return new VulkanDeconvolution(backend, inputs, op->main_as_Convolution2D());
+        return new VulkanDeconvolution(backend, inputs, op);
     }
 };
 
diff --git a/source/backend/vulkan/image/execution/VulkanDeconvolution.hpp b/source/backend/vulkan/image/execution/VulkanDeconvolution.hpp
index e133193b7..daeec8950 100644
--- a/source/backend/vulkan/image/execution/VulkanDeconvolution.hpp
+++ b/source/backend/vulkan/image/execution/VulkanDeconvolution.hpp
@@ -18,7 +18,7 @@ class VulkanDeconvolution : public VulkanBasicExecution {
     virtual ~VulkanDeconvolution() {
     }
 
-    VulkanDeconvolution(Backend* bn, const std::vector<Tensor*>& inputs, const Convolution2D* conv);
+    VulkanDeconvolution(Backend* bn, const std::vector<Tensor*>& inputs, const Op* op);
     virtual ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const VulkanCommandPool::Buffer* cmdBuffer) override;
 
diff --git a/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.cpp b/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.cpp
index 5f9855cc5..0863aaa47 100644
--- a/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.cpp
+++ b/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.cpp
@@ -9,8 +9,9 @@
 #include "VulkanDeconvolutionDepthwise.hpp"
 #include "core/Macro.h"
 namespace MNN {
-VulkanDeconvolutionDepthwise::VulkanDeconvolutionDepthwise(Backend* bn, const Convolution2D* conv)
+VulkanDeconvolutionDepthwise::VulkanDeconvolutionDepthwise(Backend* bn, const Op* op)
     : VulkanBasicExecution(bn) {
+    auto conv = op->main_as_Convolution2D();
     mConvCommonOption = conv->common();
     auto vkBn         = (VulkanBackend*)bn;
     int outputC4      = UP_DIV(mConvCommonOption->outputCount(), 4);
@@ -41,7 +42,7 @@ VulkanDeconvolutionDepthwise::VulkanDeconvolutionDepthwise(Backend* bn, const Co
     const float* tempWeight = nullptr;
     int tempWeightSize   = 0;
     std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
-    ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &tempWeight, &tempWeightSize);
+    ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &tempWeight, &tempWeightSize);
 
     for (int b = 0; b < co; ++b) {
         int b_4      = b / 4;
@@ -112,7 +113,7 @@ class VulkanDeconvolutionDepthwiseCreator : public VulkanBackend::Creator {
         if (inputs.size() > 1) {
             return nullptr;
         }
-        return new VulkanDeconvolutionDepthwise(backend, op->main_as_Convolution2D());
+        return new VulkanDeconvolutionDepthwise(backend, op);
     }
 };
 
diff --git a/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.hpp b/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.hpp
index 690031968..412bd1957 100644
--- a/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.hpp
+++ b/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.hpp
@@ -17,7 +17,7 @@ class VulkanDeconvolutionDepthwise : public VulkanBasicExecution {
     virtual ~VulkanDeconvolutionDepthwise() {
     }
 
-    VulkanDeconvolutionDepthwise(Backend* bn, const Convolution2D* conv);
+    VulkanDeconvolutionDepthwise(Backend* bn, const Op* op);
     virtual ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const VulkanCommandPool::Buffer* cmdBuffer) override;
 
diff --git a/source/backend/vulkan/image/execution/VulkanRaster.cpp b/source/backend/vulkan/image/execution/VulkanRaster.cpp
index d4d6e016b..d5cc81b07 100644
--- a/source/backend/vulkan/image/execution/VulkanRaster.cpp
+++ b/source/backend/vulkan/image/execution/VulkanRaster.cpp
@@ -236,6 +236,9 @@ ErrorCode VulkanRaster::onEncode(const std::vector<Tensor *> &___inputs, const s
 class VulkanRasterCreator : public VulkanBackend::Creator {
 public:
     virtual VulkanBasicExecution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, Backend* bn) const override {
+        if (outputs[0]->getType().bytes() < 4) {
+            return nullptr;
+        }
         return new VulkanRaster(bn);
     }
 };
diff --git a/source/backend/vulkan/image/execution/VulkanUnary.cpp b/source/backend/vulkan/image/execution/VulkanUnary.cpp
index 574ccfcc2..839c43d48 100644
--- a/source/backend/vulkan/image/execution/VulkanUnary.cpp
+++ b/source/backend/vulkan/image/execution/VulkanUnary.cpp
@@ -74,6 +74,8 @@ static std::string _getMidType(const Op* op) {
             SETTYPE(UnaryOpOperation_ROUND, "ROUND");
             SETTYPE(UnaryOpOperation_HARDSWISH, "HARDSWISH");
             SETTYPE(UnaryOpOperation_GELU, "GELU");
+            // Since SPIR-V lacks a built-in erf (gauss error function) instruction and the existing shader implementation of GELU is essentially an approximation of erf, there is no need to add a new implementation of GELU_STANDARD.
+            SETTYPE(UnaryOpOperation_GELU_STANDARD, "GELU");
         } while(false);
 #undef SETTYPE
     }
diff --git a/source/backend/vulkan/image/execution/glsl/argmax.comp b/source/backend/vulkan/image/execution/glsl/argmax.comp
new file mode 100644
index 000000000..4a52df6d8
--- /dev/null
+++ b/source/backend/vulkan/image/execution/glsl/argmax.comp
@@ -0,0 +1,51 @@
+#version 440 core
+#define FLOAT float
+
+layout(std430) buffer;
+layout(set=0, binding=0) writeonly buffer destBuffer{
+    int data[];
+} uOutput;
+
+layout(set=0, binding=1) readonly buffer sourceBuffer0{
+    FLOAT data[];
+} uInput;
+
+layout(set=0, binding=2) uniform constBuffer {
+    int w; //inside
+    int h; //axis
+    int c; //outside
+    float k; // 0
+}uConst;
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+void main()
+{
+    ivec3 posTmp = ivec3(gl_GlobalInvocationID);
+    ivec2 pos;
+    pos.x = posTmp.x / uConst.w;
+    pos.y = posTmp.x % uConst.w;
+    // x: index in outside, y: index in inside
+    if(pos.y < uConst.w && pos.x < uConst.c)
+    {
+        int basicOffset = pos.x * uConst.w * uConst.h + pos.y;
+        FLOAT value = uInput.data[basicOffset];
+        int index = 0;
+        for(int i = 1; i < uConst.h; ++i)
+        {
+            FLOAT valueCurr = uInput.data[basicOffset + i * uConst.w];
+#ifndef ARGMIN
+            if (valueCurr > value) {
+                value = valueCurr;
+                index = i;
+            }
+#else
+            if (valueCurr < value) {
+                value = valueCurr;
+                index = i;
+            }
+#endif
+        }
+        uOutput.data[posTmp.x] = index;
+    }
+}
diff --git a/source/backend/vulkan/image/execution/glsl/avgpool.comp b/source/backend/vulkan/image/execution/glsl/avgpool.comp
index 5548222d2..a0b46a905 100644
--- a/source/backend/vulkan/image/execution/glsl/avgpool.comp
+++ b/source/backend/vulkan/image/execution/glsl/avgpool.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std140) buffer;
-layout(set=0, binding=0) writeonly restrict uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 layout(set=0, binding=1) uniform sampler2D uInput;
 
 
diff --git a/source/backend/vulkan/image/execution/glsl/binaryImage.comp b/source/backend/vulkan/image/execution/glsl/binaryImage.comp
index 5e79b2256..c7d25085b 100644
--- a/source/backend/vulkan/image/execution/glsl/binaryImage.comp
+++ b/source/backend/vulkan/image/execution/glsl/binaryImage.comp
@@ -1,5 +1,5 @@
 #version 440 core
-layout(set=0, binding=0) writeonly restrict uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 layout(set=0, binding=1) uniform sampler2D uInput0;
 layout(set=0, binding=2) uniform sampler2D uInput1;
 
diff --git a/source/backend/vulkan/image/execution/glsl/blit_image.comp b/source/backend/vulkan/image/execution/glsl/blit_image.comp
index d1d6eeca5..4c2d000b4 100644
--- a/source/backend/vulkan/image/execution/glsl/blit_image.comp
+++ b/source/backend/vulkan/image/execution/glsl/blit_image.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std430) buffer;
-layout(set=0, binding=0) writeonly restrict uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 layout(set=0, binding=1) uniform sampler2D uInput;
 
 layout(set=0, binding=2) uniform constBuffer{
diff --git a/source/backend/vulkan/image/execution/glsl/convolutionDepthwise.comp b/source/backend/vulkan/image/execution/glsl/convolutionDepthwise.comp
index b15b3464f..b34b48c96 100644
--- a/source/backend/vulkan/image/execution/glsl/convolutionDepthwise.comp
+++ b/source/backend/vulkan/image/execution/glsl/convolutionDepthwise.comp
@@ -1,15 +1,13 @@
 #version 440 core
 
-layout(std140) buffer;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
+layout(set=0, binding=1) uniform sampler2D uInput;
 
-layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput;
-layout(set=0, binding=1) uniform mediump sampler2D uInput;
+layout(set=0, binding=2) uniform sampler2D uKernel;
 
-layout(set=0, binding=2) uniform mediump sampler2D uKernel;
+layout(set=0, binding=3) uniform sampler2D uBias;
 
-layout(set=0, binding=3) uniform mediump sampler2D uBias;
-
-layout(set=0, binding=4) uniform constBuffer {
+layout(set=0, binding=4) readonly uniform constBuffer {
     ivec2 pad;
     ivec2 kernelSize;
     ivec2 stride;
diff --git a/source/backend/vulkan/image/execution/glsl/convolutionDepthwiseMali.comp b/source/backend/vulkan/image/execution/glsl/convolutionDepthwiseMali.comp
index f9d81e461..39d068b40 100644
--- a/source/backend/vulkan/image/execution/glsl/convolutionDepthwiseMali.comp
+++ b/source/backend/vulkan/image/execution/glsl/convolutionDepthwiseMali.comp
@@ -1,15 +1,13 @@
 #version 440 core
 
-layout(std140) buffer;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
+layout(set=0, binding=1) uniform sampler2D uInput;
 
-layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput;
-layout(set=0, binding=1) uniform mediump sampler2D uInput;
+layout(set=0, binding=2) uniform sampler2D uKernel;
 
-layout(set=0, binding=2) uniform mediump sampler2D uKernel;
+layout(set=0, binding=3) uniform sampler2D uBias;
 
-layout(set=0, binding=3) uniform mediump sampler2D uBias;
-
-layout(set=0, binding=4) uniform constBuffer {
+layout(set=0, binding=4) readonly uniform constBuffer {
     ivec2 pad;
     ivec2 kernelSize;
     ivec2 stride;
diff --git a/source/backend/vulkan/image/execution/glsl/deconvCol2Im.comp b/source/backend/vulkan/image/execution/glsl/deconvCol2Im.comp
index 5a9283986..a394d537d 100644
--- a/source/backend/vulkan/image/execution/glsl/deconvCol2Im.comp
+++ b/source/backend/vulkan/image/execution/glsl/deconvCol2Im.comp
@@ -1,7 +1,7 @@
 #version 440 core
 
-layout(set=0, binding=0) uniform mediump sampler2D uInput;
-layout(set=0, binding=1) writeonly uniform mediump image2D uOutput;
+layout(set=0, binding=0) uniform sampler2D uInput;
+layout(set=0, binding=1) writeonly uniform image2D uOutput;
 
 layout(set=0, binding=2) readonly uniform constBuffer {
     ivec2 pad;
diff --git a/source/backend/vulkan/image/execution/glsl/deconvIm2Col.comp b/source/backend/vulkan/image/execution/glsl/deconvIm2Col.comp
index a56c8fc2e..97dc19d28 100644
--- a/source/backend/vulkan/image/execution/glsl/deconvIm2Col.comp
+++ b/source/backend/vulkan/image/execution/glsl/deconvIm2Col.comp
@@ -1,9 +1,9 @@
 #version 440 core
 layout(std430) buffer;
 
-layout(set=0, binding=0) writeonly mediump uniform image2D uOutput;
-layout(set=0, binding=1) uniform mediump sampler2D uInput;
-layout(set=0, binding=2) uniform mediump sampler2D uBias;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
+layout(set=0, binding=1) uniform sampler2D uInput;
+layout(set=0, binding=2) uniform sampler2D uBias;
 
 layout(set=0, binding=3) readonly uniform constBuffer {
     ivec2 pad;
diff --git a/source/backend/vulkan/image/execution/glsl/deconvolutionDepthwise.comp b/source/backend/vulkan/image/execution/glsl/deconvolutionDepthwise.comp
index c22eb2d49..32b5e9219 100644
--- a/source/backend/vulkan/image/execution/glsl/deconvolutionDepthwise.comp
+++ b/source/backend/vulkan/image/execution/glsl/deconvolutionDepthwise.comp
@@ -1,10 +1,10 @@
 #version 450 core
 layout(std430) buffer;
 
-layout(set=0, binding=0) writeonly mediump uniform image2D uOutput;
-layout(set=0, binding=1) uniform mediump sampler2D uInput;
-layout(set=0, binding=2) uniform mediump sampler2D uKernel;
-layout(set=0, binding=3) uniform mediump sampler2D uBias;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
+layout(set=0, binding=1) uniform sampler2D uInput;
+layout(set=0, binding=2) uniform sampler2D uKernel;
+layout(set=0, binding=3) uniform sampler2D uBias;
 
 layout(set=0, binding=4) readonly uniform constBuffer {
     ivec2 pad;
diff --git a/source/backend/vulkan/image/execution/glsl/fill_image.comp b/source/backend/vulkan/image/execution/glsl/fill_image.comp
index abba311f6..9a4818051 100644
--- a/source/backend/vulkan/image/execution/glsl/fill_image.comp
+++ b/source/backend/vulkan/image/execution/glsl/fill_image.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std430) buffer;
-layout(set=0, binding=0) writeonly restrict uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 layout(set=0, binding=1) uniform constBuffer{
     vec4 value;
     ivec4 imageSize;
diff --git a/source/backend/vulkan/image/execution/glsl/gemm16x16.comp b/source/backend/vulkan/image/execution/glsl/gemm16x16.comp
index 636756974..b71b66207 100644
--- a/source/backend/vulkan/image/execution/glsl/gemm16x16.comp
+++ b/source/backend/vulkan/image/execution/glsl/gemm16x16.comp
@@ -9,11 +9,11 @@
 #define MAT4 mat4
 #endif
 layout(std140) buffer;
-layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput;
-layout(set=0, binding=1) uniform mediump sampler2D uInput;
-layout(set=0, binding=2) uniform mediump sampler2D uKernel;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
+layout(set=0, binding=1) uniform sampler2D uInput;
+layout(set=0, binding=2) uniform sampler2D uKernel;
 
-layout(set=0, binding=3) readonly restrict uniform constBuffer {
+layout(set=0, binding=3) readonly uniform constBuffer {
     ivec4 outputSize;
 }uConst;
 
diff --git a/source/backend/vulkan/image/execution/glsl/gridSampleBilinear.comp b/source/backend/vulkan/image/execution/glsl/gridSampleBilinear.comp
index 801108ed0..358b42c1b 100644
--- a/source/backend/vulkan/image/execution/glsl/gridSampleBilinear.comp
+++ b/source/backend/vulkan/image/execution/glsl/gridSampleBilinear.comp
@@ -1,9 +1,9 @@
 #version 450 core
 layout(std430) buffer;
 
-layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput;
-layout(set=0, binding=1) uniform mediump sampler2D uInput;
-layout(set=0, binding=2) uniform mediump sampler2D uGrid;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
+layout(set=0, binding=1) uniform sampler2D uInput;
+layout(set=0, binding=2) uniform sampler2D uGrid;
 
 layout(set=0, binding=3) uniform gridSampleBuffer{
 	ivec4 outImgSize;
diff --git a/source/backend/vulkan/image/execution/glsl/gridSampleNearest.comp b/source/backend/vulkan/image/execution/glsl/gridSampleNearest.comp
index fa8a9f041..20adc3a93 100644
--- a/source/backend/vulkan/image/execution/glsl/gridSampleNearest.comp
+++ b/source/backend/vulkan/image/execution/glsl/gridSampleNearest.comp
@@ -1,9 +1,9 @@
 #version 450 core
 layout(std430) buffer;
 
-layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput;
-layout(set=0, binding=1) uniform mediump sampler2D uInput;
-layout(set=0, binding=2) uniform mediump sampler2D uGrid;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
+layout(set=0, binding=1) uniform sampler2D uInput;
+layout(set=0, binding=2) uniform sampler2D uGrid;
 
 layout(set=0, binding=3) uniform gridSampleBuffer{
 	ivec4 outImgSize;
diff --git a/source/backend/vulkan/image/execution/glsl/im2col.comp b/source/backend/vulkan/image/execution/glsl/im2col.comp
index f62ee3861..9485c931e 100644
--- a/source/backend/vulkan/image/execution/glsl/im2col.comp
+++ b/source/backend/vulkan/image/execution/glsl/im2col.comp
@@ -1,8 +1,8 @@
 #version 440 core
 layout(std140) buffer;
 
-layout(set=0, binding=0) writeonly mediump uniform image2D uOutput;
-layout(set=0, binding=1) uniform mediump sampler2D uInput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
+layout(set=0, binding=1) uniform sampler2D uInput;
 
 layout(set=0, binding=2) readonly uniform constBuffer {
     ivec2 pad;
diff --git a/source/backend/vulkan/image/execution/glsl/im2col1x1.comp b/source/backend/vulkan/image/execution/glsl/im2col1x1.comp
index bbac4e8f4..ac4306af5 100644
--- a/source/backend/vulkan/image/execution/glsl/im2col1x1.comp
+++ b/source/backend/vulkan/image/execution/glsl/im2col1x1.comp
@@ -1,8 +1,8 @@
 #version 440 core
 layout(std140) buffer;
 
-layout(set=0, binding=0) writeonly mediump uniform image2D uOutput;
-layout(set=0, binding=1) mediump uniform sampler2D uInput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
+layout(set=0, binding=1) uniform sampler2D uInput;
 
 layout(set=0, binding=2) readonly uniform constBuffer {
     ivec2 pad;
diff --git a/source/backend/vulkan/image/execution/glsl/imageTonc4hw4.comp b/source/backend/vulkan/image/execution/glsl/imageTonc4hw4.comp
index 66beb5094..afcb45b1c 100644
--- a/source/backend/vulkan/image/execution/glsl/imageTonc4hw4.comp
+++ b/source/backend/vulkan/image/execution/glsl/imageTonc4hw4.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std430) buffer;
-layout(set=0, binding=0) uniform mediump sampler2D uInput;
+layout(set=0, binding=0) uniform sampler2D uInput;
 
 layout(set=0, binding=1) writeonly buffer destBuffer{
     vec4 data[];
diff --git a/source/backend/vulkan/image/execution/glsl/imageTonchw.comp b/source/backend/vulkan/image/execution/glsl/imageTonchw.comp
index 4b7d3d694..de2f4b7e3 100644
--- a/source/backend/vulkan/image/execution/glsl/imageTonchw.comp
+++ b/source/backend/vulkan/image/execution/glsl/imageTonchw.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std430) buffer;
-layout(set=0, binding=0) uniform mediump sampler2D uInput;
+layout(set=0, binding=0) uniform sampler2D uInput;
 
 layout(set=0, binding=1) writeonly buffer destBuffer{
     float data[];
diff --git a/source/backend/vulkan/image/execution/glsl/macro.json b/source/backend/vulkan/image/execution/glsl/macro.json
index ff07a939e..3c964eff1 100644
--- a/source/backend/vulkan/image/execution/glsl/macro.json
+++ b/source/backend/vulkan/image/execution/glsl/macro.json
@@ -1,4 +1,7 @@
 {
+    "argmax.comp":[
+        "ARGMIN"
+    ],
     "matmul_output.comp":[
         "BIAS",
         "TRANSPOSE",
diff --git a/source/backend/vulkan/image/execution/glsl/matmul_input.comp b/source/backend/vulkan/image/execution/glsl/matmul_input.comp
index 5f2d76109..d413e650a 100644
--- a/source/backend/vulkan/image/execution/glsl/matmul_input.comp
+++ b/source/backend/vulkan/image/execution/glsl/matmul_input.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std430) buffer;
-layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 layout(set=0, binding=1) uniform sampler2D uInput;
 layout(set=0, binding=2) uniform offsetBuffer {
     ivec4 size;//w/4, h/4, hLimit, w/4*h/4
diff --git a/source/backend/vulkan/image/execution/glsl/matmul_output.comp b/source/backend/vulkan/image/execution/glsl/matmul_output.comp
index d65483b74..6f994c114 100644
--- a/source/backend/vulkan/image/execution/glsl/matmul_output.comp
+++ b/source/backend/vulkan/image/execution/glsl/matmul_output.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std430) buffer;
-layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 layout(set=0, binding=1) uniform sampler2D uInput;
 layout(set=0, binding=2) uniform offsetBuffer {
     ivec4 size;//w/4, h/4, w, w/4*h/4
diff --git a/source/backend/vulkan/image/execution/glsl/maxpool.comp b/source/backend/vulkan/image/execution/glsl/maxpool.comp
index 222b9cb5e..c644709d0 100644
--- a/source/backend/vulkan/image/execution/glsl/maxpool.comp
+++ b/source/backend/vulkan/image/execution/glsl/maxpool.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std430) buffer;
-layout(set=0, binding=0) writeonly restrict uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 layout(set=0, binding=1) uniform sampler2D uInput;
 
 
diff --git a/source/backend/vulkan/image/execution/glsl/nc4hw4toimage.comp b/source/backend/vulkan/image/execution/glsl/nc4hw4toimage.comp
index 26a9f867a..f9bfb0b46 100644
--- a/source/backend/vulkan/image/execution/glsl/nc4hw4toimage.comp
+++ b/source/backend/vulkan/image/execution/glsl/nc4hw4toimage.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std430) buffer;
-layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 
 layout(set=0, binding=1) readonly buffer destBuffer{
     vec4 data[];
diff --git a/source/backend/vulkan/image/execution/glsl/nchwToimage.comp b/source/backend/vulkan/image/execution/glsl/nchwToimage.comp
index 34db019bc..ac240e46a 100644
--- a/source/backend/vulkan/image/execution/glsl/nchwToimage.comp
+++ b/source/backend/vulkan/image/execution/glsl/nchwToimage.comp
@@ -1,6 +1,6 @@
 #version 450 core
 layout(std430) buffer;
-layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 
 layout(set=0, binding=1) readonly buffer sourceBuffer{
     float data[];
diff --git a/source/backend/vulkan/image/execution/glsl/packAsImage4x4.comp b/source/backend/vulkan/image/execution/glsl/packAsImage4x4.comp
index 43ec6b1ff..6705b1cee 100644
--- a/source/backend/vulkan/image/execution/glsl/packAsImage4x4.comp
+++ b/source/backend/vulkan/image/execution/glsl/packAsImage4x4.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std430) buffer;
-layout(set=0, binding=0) writeonly restrict highp uniform image2D uOutput;
+layout(set=0, binding=0) writeonly highp uniform image2D uOutput;
 
 layout(set=0, binding=1) readonly buffer sourceBuffer{
     vec4 data[];
diff --git a/source/backend/vulkan/image/execution/glsl/preluWithChannel.comp b/source/backend/vulkan/image/execution/glsl/preluWithChannel.comp
index b9be73066..79556d6b1 100644
--- a/source/backend/vulkan/image/execution/glsl/preluWithChannel.comp
+++ b/source/backend/vulkan/image/execution/glsl/preluWithChannel.comp
@@ -1,9 +1,9 @@
 #version 450 core
 layout(std430) buffer;
 
-layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput;
-layout(set=0, binding=1) uniform mediump sampler2D uInput;
-layout(set=0, binding=2) uniform mediump sampler2D uSlope;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
+layout(set=0, binding=1) uniform sampler2D uInput;
+layout(set=0, binding=2) uniform sampler2D uSlope;
 
 layout(set = 0, binding = 3) uniform reluBuffer{
     ivec4 imgSize;
diff --git a/source/backend/vulkan/image/execution/glsl/relu.comp b/source/backend/vulkan/image/execution/glsl/relu.comp
index 118f716bd..0597d8aaa 100644
--- a/source/backend/vulkan/image/execution/glsl/relu.comp
+++ b/source/backend/vulkan/image/execution/glsl/relu.comp
@@ -1,7 +1,7 @@
 #version 450 core
 layout(std430) buffer;
-layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput;
-layout(set=0, binding=1) uniform mediump sampler2D uInput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
+layout(set=0, binding=1) uniform sampler2D uInput;
 
 layout(set = 0, binding = 2) uniform reluBuffer{
     ivec4 imgSize;
diff --git a/source/backend/vulkan/image/execution/glsl/relu6.comp b/source/backend/vulkan/image/execution/glsl/relu6.comp
index 84d4da59e..60f5f1fba 100644
--- a/source/backend/vulkan/image/execution/glsl/relu6.comp
+++ b/source/backend/vulkan/image/execution/glsl/relu6.comp
@@ -1,7 +1,7 @@
 #version 450 core
 layout(std430) buffer;
-layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput;
-layout(set=0, binding=1) uniform mediump sampler2D uInput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
+layout(set=0, binding=1) uniform sampler2D uInput;
 
 layout(set = 0, binding = 2) uniform reluBuffer{
     ivec4 imgSize;
diff --git a/source/backend/vulkan/image/execution/glsl/resizeBilinear.comp b/source/backend/vulkan/image/execution/glsl/resizeBilinear.comp
index 2740f3776..83902f7a9 100644
--- a/source/backend/vulkan/image/execution/glsl/resizeBilinear.comp
+++ b/source/backend/vulkan/image/execution/glsl/resizeBilinear.comp
@@ -1,8 +1,8 @@
 #version 450 core
 layout(std430) buffer;
 
-layout(set=0, binding=0) uniform mediump sampler2D uInput;
-layout(set=0, binding=1) writeonly restrict mediump uniform image2D uOutput;
+layout(set=0, binding=0) uniform sampler2D uInput;
+layout(set=0, binding=1) writeonly uniform image2D uOutput;
 
 layout(set = 0, binding = 2) uniform reluBuffer{
     ivec4 inImgSize;
diff --git a/source/backend/vulkan/image/execution/glsl/resizeNearest.comp b/source/backend/vulkan/image/execution/glsl/resizeNearest.comp
index ae12f258f..352dc6e28 100644
--- a/source/backend/vulkan/image/execution/glsl/resizeNearest.comp
+++ b/source/backend/vulkan/image/execution/glsl/resizeNearest.comp
@@ -1,8 +1,8 @@
 #version 450 core
 layout(std430) buffer;
 
-layout(set=0, binding=0) uniform mediump sampler2D uInput;
-layout(set=0, binding=1) writeonly restrict mediump uniform image2D uOutput;
+layout(set=0, binding=0) uniform sampler2D uInput;
+layout(set=0, binding=1) writeonly uniform image2D uOutput;
 
 layout(set = 0, binding = 2) uniform reluBuffer{
     ivec4 inImgSize;
diff --git a/source/backend/vulkan/image/execution/glsl/roipooling.comp b/source/backend/vulkan/image/execution/glsl/roipooling.comp
index 36db110aa..f0c7a582f 100644
--- a/source/backend/vulkan/image/execution/glsl/roipooling.comp
+++ b/source/backend/vulkan/image/execution/glsl/roipooling.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std140) buffer;
-layout(set=0, binding=0) writeonly restrict uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 layout(set=0, binding=1) uniform sampler2D uInput;
 
 layout(set=0, binding=2) uniform sampler2D uRoI;
diff --git a/source/backend/vulkan/image/execution/glsl/scale.comp b/source/backend/vulkan/image/execution/glsl/scale.comp
index 52b53d9a5..1ead1fa41 100644
--- a/source/backend/vulkan/image/execution/glsl/scale.comp
+++ b/source/backend/vulkan/image/execution/glsl/scale.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std140) buffer;
-layout(set=0, binding=0) writeonly restrict uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 layout(set=0, binding=1) uniform sampler2D uInput;
 
 layout(set = 0, binding = 2) readonly buffer scaleBuffer{
diff --git a/source/backend/vulkan/image/execution/glsl/unPackImage4x4.comp b/source/backend/vulkan/image/execution/glsl/unPackImage4x4.comp
index c10f6812c..57b22d56f 100644
--- a/source/backend/vulkan/image/execution/glsl/unPackImage4x4.comp
+++ b/source/backend/vulkan/image/execution/glsl/unPackImage4x4.comp
@@ -1,6 +1,6 @@
 #version 440 core
 layout(std430) buffer;
-layout(set=0, binding=0) uniform mediump sampler2D uInput;
+layout(set=0, binding=0) uniform sampler2D uInput;
 
 layout(set=0, binding=1) writeonly buffer sourceBuffer{
     vec4 data[];
diff --git a/source/backend/vulkan/image/execution/glsl/unaryImage.comp b/source/backend/vulkan/image/execution/glsl/unaryImage.comp
index 900ab129d..6483e2bf6 100644
--- a/source/backend/vulkan/image/execution/glsl/unaryImage.comp
+++ b/source/backend/vulkan/image/execution/glsl/unaryImage.comp
@@ -1,5 +1,5 @@
 #version 440
-layout(set=0, binding=0) writeonly restrict uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 layout(set=0, binding=1) uniform sampler2D uInput;
 layout(set=0, binding=2) uniform constBuffer{
     ivec4 size; // x: limit, y: channelC4*b, z:height, w:width
diff --git a/source/backend/vulkan/image/execution/glsl/winogradTransformDest2_3_1.comp b/source/backend/vulkan/image/execution/glsl/winogradTransformDest2_3_1.comp
index a5b0e600c..1317d1fa5 100644
--- a/source/backend/vulkan/image/execution/glsl/winogradTransformDest2_3_1.comp
+++ b/source/backend/vulkan/image/execution/glsl/winogradTransformDest2_3_1.comp
@@ -1,9 +1,9 @@
 #version 450 core
 layout(std430) buffer;
-layout(set=0, binding=0) writeonly restrict uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 layout(set=0, binding=1) uniform sampler2D uInput;
 layout(set=0, binding=2) uniform sampler2D uBias;
-layout(set=0, binding=3) readonly restrict uniform constBuffer {
+layout(set=0, binding=3) readonly uniform constBuffer {
     ivec4 inputSize;
     ivec4 outputSize;
     int padX;
@@ -12,7 +12,7 @@ layout(set=0, binding=3) readonly restrict uniform constBuffer {
     int unitHeight;
     int unit;
 } uConst;
-layout(set=0, binding=4) readonly restrict uniform offsetBuffer {
+layout(set=0, binding=4) readonly uniform offsetBuffer {
     ivec2 offset;
 } uOffset;
 
diff --git a/source/backend/vulkan/image/execution/glsl/winogradTransformSource2_3_1.comp b/source/backend/vulkan/image/execution/glsl/winogradTransformSource2_3_1.comp
index 52ac4c6f6..6b30d22c3 100644
--- a/source/backend/vulkan/image/execution/glsl/winogradTransformSource2_3_1.comp
+++ b/source/backend/vulkan/image/execution/glsl/winogradTransformSource2_3_1.comp
@@ -1,8 +1,8 @@
 #version 450 core
 layout(std430) buffer;
-layout(set=0, binding=0) writeonly restrict uniform image2D uOutput;
+layout(set=0, binding=0) writeonly uniform image2D uOutput;
 layout(set=0, binding=1) uniform sampler2D uInput;
-layout(set=0, binding=2) readonly restrict uniform constBuffer {
+layout(set=0, binding=2) readonly uniform constBuffer {
     ivec4 inputSize;
     ivec4 outputSize;
     int padX;
@@ -12,7 +12,7 @@ layout(set=0, binding=2) readonly restrict uniform constBuffer {
     int unit;
 } uConst;
 
-layout(set=0, binding=3) readonly restrict uniform offsetBuffer {
+layout(set=0, binding=3) readonly uniform offsetBuffer {
     ivec2 offset;
 } uOffset;
 int CLAMP_ADD(int x) {
diff --git a/source/backend/vulkan/image/shaders/AllShader.h b/source/backend/vulkan/image/shaders/AllShader.h
index ada1630e4..4297b2ced 100644
--- a/source/backend/vulkan/image/shaders/AllShader.h
+++ b/source/backend/vulkan/image/shaders/AllShader.h
@@ -212,6 +212,10 @@ extern const unsigned char glsl_buffer2Image1D_comp[];
 extern unsigned int glsl_buffer2Image1D_comp_len;
 extern const unsigned char glsl_scale_comp[];
 extern unsigned int glsl_scale_comp_len;
+extern const unsigned char glsl_argmax_comp[];
+extern unsigned int glsl_argmax_comp_len;
+extern const unsigned char glsl_argmax_ARGMIN_comp[];
+extern unsigned int glsl_argmax_ARGMIN_comp_len;
 extern const unsigned char glsl_buffer2Image3D_comp[];
 extern unsigned int glsl_buffer2Image3D_comp_len;
 #endif
\ No newline at end of file
diff --git a/source/backend/vulkan/runtime/VulkanRuntime.cpp b/source/backend/vulkan/runtime/VulkanRuntime.cpp
index 3ef3b722d..795c24f99 100644
--- a/source/backend/vulkan/runtime/VulkanRuntime.cpp
+++ b/source/backend/vulkan/runtime/VulkanRuntime.cpp
@@ -34,22 +34,40 @@ class VulkanBufferAllocator : public BufferAllocator::Allocator {
 float VulkanRuntime::onGetMemoryInMB() {
     return mMemoryPool->computeSize();
 }
-
-VulkanRuntime::VulkanRuntime(const Backend::Info& info) {
-    mInfo = info;
+VulkanRuntime* VulkanRuntime::create(const Backend::Info& info) {
     MNNVulkanContext* context = nullptr;
+    std::shared_ptr<VulkanDevice> device;
+    std::shared_ptr<VulkanInstance> instance;
     if (nullptr != info.user && nullptr != info.user->sharedContext) {
        MNN_PRINT("Use user's vulkan context\n");
        context = static_cast<MNNVulkanContext*>(info.user->sharedContext);
     }
     if (NULL != context) {
-        mInstance = std::make_shared<VulkanInstance>(context->pInstance);
-        mDevice   = std::make_shared<VulkanDevice>(mInstance, context->pPhysicalDevice, context->pDevice,
+        instance = std::make_shared<VulkanInstance>(context->pInstance);
+        if (context->pInstance == VK_NULL_HANDLE) {
+            MNN_ERROR("Invalide user's vulkan instance\n");
+            return nullptr;
+        }
+        device   = std::make_shared<VulkanDevice>(instance, context->pPhysicalDevice, context->pDevice,
                                                  context->iQueueFamilyIndex, context->pQueue);
     } else {
-        mInstance = std::make_shared<VulkanInstance>();
-        mDevice   = std::make_shared<VulkanDevice>(mInstance);
+        instance = std::make_shared<VulkanInstance>();
+        if (!instance->supportVulkan()) {
+            MNN_ERROR("Invalide device for support vulkan\n");
+            return nullptr;
+        }
+        device = std::make_shared<VulkanDevice>(instance);
+    }
+    if (device->get() == VK_NULL_HANDLE) {
+        return nullptr;
     }
+    return new VulkanRuntime(info, device, instance);
+}
+
+VulkanRuntime::VulkanRuntime(const Backend::Info& info, std::shared_ptr<VulkanDevice> device, std::shared_ptr<VulkanInstance> instance) {
+    mInfo = info;
+    mDevice = device;
+    mInstance = instance;
     auto& dev              = *mDevice;
     mCmdPool               = std::make_shared<VulkanCommandPool>(dev);
     //GFlops, Test by mobilenet v1's ms
@@ -168,31 +186,11 @@ int VulkanRuntime::onGetRuntimeStatus(RuntimeStatus statusEnum) const {
     }
     return 0;
 }
-static bool _testVulkan() {
-    // std::make_unique need c++14
-    std::unique_ptr<VulkanInstance> instance(new VulkanInstance());
-    if (nullptr == instance) {
-        MNN_ERROR("Invalide device for support vulkan\n");
-        return false;
-    }
-    if (!instance->success()) {
-        MNN_ERROR("Invalide device for support vulkan\n");
-        return false;
-    }
-    if (!instance->supportVulkan()) {
-        MNN_ERROR("Invalide device for support vulkan\n");
-        return false;
-    }
-    return true;
-}
-
 class VulkanRuntimeCreator : public RuntimeCreator {
 public:
     virtual Runtime* onCreate(const Backend::Info& info) const {
         if (InitVulkan()) {
-            if (_testVulkan()) {
-                return new VulkanRuntime(info);
-            }
+            return VulkanRuntime::create(info);
         }
         return nullptr;
     }
diff --git a/source/backend/vulkan/runtime/VulkanRuntime.hpp b/source/backend/vulkan/runtime/VulkanRuntime.hpp
index ab9edbab4..c8dfa56ac 100644
--- a/source/backend/vulkan/runtime/VulkanRuntime.hpp
+++ b/source/backend/vulkan/runtime/VulkanRuntime.hpp
@@ -24,7 +24,6 @@
 namespace MNN {
 class VulkanRuntime : public Runtime {
 public:
-    VulkanRuntime(const Backend::Info& info);
     virtual ~ VulkanRuntime();
 
     virtual Backend* onCreate(const BackendConfig* config) const override;
@@ -34,7 +33,9 @@ class VulkanRuntime : public Runtime {
     int onGetRuntimeStatus(RuntimeStatus statusEnum) const override;
     std::shared_ptr<VulkanBuffer> allocUniform(const void* src = nullptr, int size = 0);
     void recycleUniform(std::shared_ptr<VulkanBuffer> buffer);
+    static VulkanRuntime* create(const Backend::Info& info);
 private:
+    VulkanRuntime(const Backend::Info& info, std::shared_ptr<VulkanDevice> device, std::shared_ptr<VulkanInstance> instance);
     Backend::Info mInfo;
     std::shared_ptr<BufferAllocator> mBufferPool;
     std::shared_ptr<VulkanPipelineFactory> mPipelineFactory;
diff --git a/source/backend/vulkan/vulkan/vulkan_core.h b/source/backend/vulkan/vulkan/vulkan_core.h
index 228e4ef6e..67a14f7bb 100644
--- a/source/backend/vulkan/vulkan/vulkan_core.h
+++ b/source/backend/vulkan/vulkan/vulkan_core.h
@@ -2078,6 +2078,12 @@ typedef enum VkImageCreateFlagBits {
 } VkImageCreateFlagBits;
 typedef VkFlags VkImageCreateFlags;
 
+// Introduced from "vulkan_core.h" in Vulkan SDK Version 1.3.290.0.
+typedef enum VkInstanceCreateFlagBits {
+    VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR = 0x00000001
+} VkInstanceCreateFlagBits;
+typedef VkFlags VkInstanceCreateFlags;
+
 typedef enum VkSampleCountFlagBits {
     VK_SAMPLE_COUNT_1_BIT = 0x00000001,
     VK_SAMPLE_COUNT_2_BIT = 0x00000002,
@@ -9466,7 +9472,8 @@ typedef VkFormatFeatureFlagBits2 VkFormatFeatureFlagBits2KHR;
 
 typedef VkFormatProperties3 VkFormatProperties3KHR;
 
-
+// Introduced from "vulkan_core.h" in Vulkan SDK Version 1.3.290.0.
+#define VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME "VK_KHR_portability_enumeration"
 
 #define VK_KHR_maintenance4 1
 #define VK_KHR_MAINTENANCE_4_SPEC_VERSION 2
diff --git a/source/core/Backend.cpp b/source/core/Backend.cpp
index 50bda5066..c1d9a267b 100644
--- a/source/core/Backend.cpp
+++ b/source/core/Backend.cpp
@@ -69,22 +69,6 @@ void registerBackend() {
 #if MNN_METAL_ENABLED
         registerMetalRuntimeCreator();
 #endif
-        auto& gExtraCreator = GetExtraCreator();
-        for(auto iter = gExtraCreator.begin(); iter != gExtraCreator.end();){
-            if(!iter->second.second){
-                iter++;
-            }else{
-                Backend::Info info;
-                info.type = iter->first;
-                std::shared_ptr<Runtime> bn(iter->second.first->onCreate(info));
-                if (nullptr == bn.get()) {
-                    iter = gExtraCreator.erase(iter);
-                    MNN_ERROR("Error to use creator of %d, delete it\n", info.type);
-                }else{
-                    iter++;
-                }
-            }
-        }
     });
 }
 
diff --git a/source/core/Backend.hpp b/source/core/Backend.hpp
index 2605047ec..0d199bd90 100644
--- a/source/core/Backend.hpp
+++ b/source/core/Backend.hpp
@@ -39,6 +39,15 @@ struct RuntimeHint {
     // 2: Only quantize value cache, use fp8 quantization
     // 3: quantize both key and value cache as described above
     int kvcacheQuantOption = 0;
+    
+    // the kvcache size limit of each layer
+    // if the size of kvcache in memory exceeds the limit
+    // it will be moved to disk to save memory
+    // -1 for no limit
+    int kvcacheSizeLimit = -1;
+
+    // path of the kvcache directory
+    std::string kvcacheDirPath = "/tmp";
 };
 /** abstract backend */
 class Backend : public NonCopyable {
@@ -263,7 +272,7 @@ class Runtime : public NonCopyable {
     /**
      @brief reset runtime
      */
-    virtual void onReset(int numberThread, const BackendConfig* config) {
+    virtual void onReset(int numberThread, const BackendConfig* config, bool full) {
         // Do nothing
     }
 
diff --git a/source/core/BufferAllocator.cpp b/source/core/BufferAllocator.cpp
index 1495bc7c1..43104da80 100644
--- a/source/core/BufferAllocator.cpp
+++ b/source/core/BufferAllocator.cpp
@@ -309,7 +309,7 @@ MemChunk DeferBufferAllocator::alloc(size_t size, bool separate, size_t align) {
         auto newChunk = createMemNode(size);
         insert_after(newChunk);
 #ifdef DUMP_USAGE
-    MNN_PRINT("Defer alloc: %p\n", newChunk);
+    MNN_PRINT("Defer alloc: %p, %d\n", newChunk, size);
 #endif
         return MemChunk(newChunk);
     }
@@ -332,7 +332,7 @@ MemChunk DeferBufferAllocator::alloc(size_t size, bool separate, size_t align) {
     // equal no change; small expand
     selectChunk->size = size;
 #ifdef DUMP_USAGE
-    MNN_PRINT("Defer alloc: %p\n", selectChunk);
+    MNN_PRINT("Defer alloc: %p, %d\n", selectChunk, size);
 #endif
     return MemChunk(selectChunk);
 }
diff --git a/source/core/ConvolutionCommon.cpp b/source/core/ConvolutionCommon.cpp
index 2418bd211..970b17288 100644
--- a/source/core/ConvolutionCommon.cpp
+++ b/source/core/ConvolutionCommon.cpp
@@ -16,20 +16,57 @@
 
 namespace MNN {
 
-std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const Convolution2D *conv, Backend* backend, bool forceFloat, bool forceInt8) {
+std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const Op* op, Backend* backend, bool forceFloat, bool forceInt8) {
+    auto conv = op->main_as_Convolution2D();
     auto quan = conv->quanParameter();
     auto result = std::make_shared<Int8Common>();
     result->quan = quan;
     size_t buffer_size = 0, alpha_size = 0;
     const int8_t* buffer_ptr = nullptr;
     const float* alpha_ptr = nullptr;
-    if (quan->buffer()) {
-        buffer_size = quan->buffer()->size();
-        buffer_ptr = quan->buffer()->data();
-    }
-    if (quan->alpha()) {
-        alpha_size = quan->alpha()->size();
-        alpha_ptr = quan->alpha()->data();
+    std::unique_ptr<int8_t[]> external_buffer;
+    size_t weightLength = 0;
+    int8_t *buffer        = nullptr;
+    if (USE_EXTERNAL_DATA(conv) && op->externalPath() && quan->buffer() == nullptr) {
+        // external data
+        auto external_info = conv->external()->data();
+        std::unique_ptr<FileLoader> external_file(new FileLoader(op->externalPath()->c_str()));
+        external_file->offset(external_info[0]);
+        buffer_size = external_info[1];
+        if (0 != buffer_size) {
+            if (1 == quan->type() && !forceFloat) {
+                buffer = IDSTDecoder::ReadQuanData_c(external_file.get(), &weightLength, result.get(), quan->shapeInt32(), forceInt8);
+            } else {
+                external_buffer.reset(new int8_t[buffer_size]);
+                buffer_ptr = external_buffer.get();
+                external_file->read((char*)buffer_ptr, buffer_size);
+            }
+        }
+        alpha_size = external_info[2] / sizeof(float);
+        if (0 != alpha_size) {
+            result->alpha.reset(alpha_size);
+            if (nullptr == result->alpha.get()) {
+                MNN_PRINT("Alloc memory error for extract idst int8\n");
+                return nullptr;
+            }
+            alpha_ptr = result->alpha.get();
+            external_file->read((char*)alpha_ptr, alpha_size * sizeof(float));
+        }
+    } else {
+        if (quan->buffer()) {
+            buffer_size = quan->buffer()->size();
+            buffer_ptr = quan->buffer()->data();
+        }
+        if (quan->alpha()) {
+            alpha_size = quan->alpha()->size();
+            alpha_ptr = quan->alpha()->data();
+            result->alpha.reset(alpha_size);
+            if (nullptr == result->alpha.get()) {
+                MNN_PRINT("Alloc memory error for extract idst int8\n");
+                return nullptr;
+            }
+            ::memcpy(result->alpha.get(), alpha_ptr, alpha_size * sizeof(float));
+        }
     }
     if (quan->index() != nullptr) {
         if (forceFloat) {
@@ -51,16 +88,15 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const Con
         } // Otherwise needn't treat, just return result with quan info
         return result;
     }
-    size_t weightLength = 0;
-    int8_t *buffer        = nullptr;
-    auto originBuffer     = (unsigned char *)buffer_ptr;
 
-    if (1 == quan->type()) {
-        buffer = IDSTDecoder::ReadQuanData_c(originBuffer, &weightLength, result.get(), quan->shapeInt32());
+    std::unique_ptr<MemoryLoader> originBuffer(new MemoryLoader((unsigned char*)buffer_ptr));
+    if (1 == quan->type() && weightLength == 0) {
+        buffer = IDSTDecoder::ReadQuanData_c(originBuffer.get(), &weightLength, result.get(), quan->shapeInt32(), forceInt8);
     }
     if (2 == quan->type()) {
-        buffer = IDSTDecoder::ReadSparseQuanData_c(originBuffer, &weightLength, alpha_ptr, alpha_size, result.get(), quan->shapeInt32());
+        buffer = IDSTDecoder::ReadSparseQuanData_c(originBuffer.get(), &weightLength, alpha_ptr, alpha_size, result.get(), quan->shapeInt32());
     }
+    /*
     if (result->weightMap.size() > 0) {
         result->canUseInt4 = true;
         for (auto value : result->weightMap) {
@@ -69,6 +105,7 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const Con
             }
         }
     }
+    */
     // read fp16 data
     if (3 == quan->type()) {
         weightLength = buffer_size / sizeof(half_float::half);
@@ -99,12 +136,6 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const Con
         }
         result->weight.set(buffer, weightLength);
     }
-    result->alpha.reset(alpha_size);
-    if (nullptr == result->alpha.get()) {
-        MNN_PRINT("Alloc memory error for extract idst int8\n");
-        return nullptr;
-    }
-    ::memcpy(result->alpha.get(), alpha_ptr, alpha_size * sizeof(float));
     {
         int outputCount = 0;
         bool oldType4 = (quan->type() == 4 && quan->aMin() == 0 && std::abs(quan->quantScale()) < 1e-6);
@@ -128,9 +159,10 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const Con
             // for old type 4 models, their quan->quantScale is 0. which will introduce a bug here
             if (oldType4) {
                 extraFactor = 1.0f;
-            }
-            for (int o=0; o<result->alpha.size(); ++o) {
-                result->alpha.get()[o] *= extraFactor;
+            } else if (extraFactor != 1.0f) {
+                for (int o=0; o<result->alpha.size(); ++o) {
+                    result->alpha.get()[o] *= extraFactor;
+                }
             }
         }
     }
@@ -172,12 +204,13 @@ std::shared_ptr<ConvolutionCommon::Int8Common> ConvolutionCommon::load(const Con
     return result;
 }
 
-void ConvolutionCommon::getConvParameters(std::shared_ptr<Int8Common> *quanCommon, Backend* backend, const MNN::Convolution2D *conv2d, const float** originWeight, int* originWeightSize) {
+void ConvolutionCommon::getConvParameters(std::shared_ptr<Int8Common> *quanCommon, Backend* backend, const MNN::Op *op, const float** originWeight, int* originWeightSize) {
+    auto conv2d = op->main_as_Convolution2D();
     *originWeight = nullptr;
     *originWeightSize = 0;
     if (nullptr != conv2d->quanParameter()) {
         bool forceFloat = conv2d->quanParameter()->index() != nullptr;
-        *quanCommon = load(conv2d, backend, forceFloat);
+        *quanCommon = load(op, backend, forceFloat);
         *originWeight     = (*quanCommon)->weightFloat.get();
         *originWeightSize = (*quanCommon)->weightFloat.size();
     }
@@ -187,8 +220,9 @@ void ConvolutionCommon::getConvParameters(std::shared_ptr<Int8Common> *quanCommo
     }
 }
 
-bool ConvolutionCommon::getConvInt8Parameters(const MNN::Convolution2D* conv2d, std::shared_ptr<Int8Common>& quanCommon, Backend* backend,
+bool ConvolutionCommon::getConvInt8Parameters(const MNN::Op* op, std::shared_ptr<Int8Common>& quanCommon, Backend* backend,
                                               const int8_t*& weight, int& weightSize, float*& scale, int32_t*& bias, int32_t*& weightQuantZeroPoint) {
+    auto conv2d = op->main_as_Convolution2D();
     int outputCount = conv2d->common()->outputCount();
     weightSize = 0;
     auto core = static_cast<CPUBackend*>(backend)->functions();
@@ -197,8 +231,8 @@ bool ConvolutionCommon::getConvInt8Parameters(const MNN::Convolution2D* conv2d,
         weight = conv2d->symmetricQuan()->weight()->data();
         weightSize = conv2d->symmetricQuan()->weight()->size();
     }
-    if (conv2d->quanParameter() && conv2d->quanParameter()->buffer()) { // int8 weight
-        quanCommon = ConvolutionCommon::load(conv2d, backend, false, true);
+    if (conv2d->quanParameter() && (conv2d->quanParameter()->buffer() || conv2d->external())) { // int8 weight
+        quanCommon = ConvolutionCommon::load(op, backend, false, true);
         MNN_ASSERT(quanCommon != nullptr);
         weight = quanCommon->weight.get();
         weightSize = quanCommon->weight.size();
@@ -211,6 +245,7 @@ bool ConvolutionCommon::getConvInt8Parameters(const MNN::Convolution2D* conv2d,
     if (quanCommon && quanCommon->asymmetric) {
         weightAsy = true;
     }
+
     if (conv2d->symmetricQuan() && conv2d->symmetricQuan()->bias() && conv2d->symmetricQuan()->scale()) {
         // Compability for old model
         MNN_ASSERT(conv2d->symmetricQuan()->bias()->size() == outputCount && conv2d->symmetricQuan()->scale()->size() == outputCount);
diff --git a/source/core/ConvolutionCommon.hpp b/source/core/ConvolutionCommon.hpp
index 28e3acf83..7b1bbd5f0 100644
--- a/source/core/ConvolutionCommon.hpp
+++ b/source/core/ConvolutionCommon.hpp
@@ -24,9 +24,9 @@ class MNN_PUBLIC ConvolutionCommon : public Execution {
         bool canUseInt4 = false;
         Backend* backend = nullptr;
     };
-    static std::shared_ptr<Int8Common> load(const Convolution2D* conv, Backend* backend = nullptr, bool forceFloat = false, bool forceInt8 = false);
-    static void getConvParameters(std::shared_ptr<ConvolutionCommon::Int8Common> *quanCommon, Backend* backend, const MNN::Convolution2D *conv2d, const float** originWeight, int* originWeightSize);
-    static bool getConvInt8Parameters(const MNN::Convolution2D* conv2d, std::shared_ptr<Int8Common>& quanCommon, Backend* backend,
+    static std::shared_ptr<Int8Common> load(const Op* op, Backend* backend = nullptr, bool forceFloat = false, bool forceInt8 = false);
+    static void getConvParameters(std::shared_ptr<ConvolutionCommon::Int8Common> *quanCommon, Backend* backend, const MNN::Op *op, const float** originWeight, int* originWeightSize);
+    static bool getConvInt8Parameters(const MNN::Op* op, std::shared_ptr<Int8Common>& quanCommon, Backend* backend,
                                       const int8_t*& weight, int& weightSize, float*& scale, int32_t*& bias, int32_t*& weightQuantZero);
 
     // Return padX, padY
diff --git a/source/core/FileLoader.hpp b/source/core/FileLoader.hpp
index 70fdddfd0..46e8036b8 100644
--- a/source/core/FileLoader.hpp
+++ b/source/core/FileLoader.hpp
@@ -13,14 +13,22 @@
 
 #include "core/AutoStorage.h"
 namespace MNN {
-class MNN_PUBLIC FileLoader {
+
+class BaseLoader {
+public:
+    BaseLoader() = default;
+    virtual ~BaseLoader() = default;
+    virtual bool read(char* buffer, int64_t size) = 0;
+};
+
+class MNN_PUBLIC FileLoader : public BaseLoader {
 public:
     FileLoader(const char* file, bool init = false);
 
     ~FileLoader();
 
     bool read();
-    
+
     static bool write(const char* filePath, std::pair<const void*, size_t> cacheInfo);
 
     bool valid() const {
@@ -29,6 +37,9 @@ class MNN_PUBLIC FileLoader {
     inline size_t size() const {
         return mTotalSize;
     }
+    inline std::string path() const {
+        return mFilePath;
+    }
 
     bool merge(AutoStorage<uint8_t>& buffer);
 
@@ -44,5 +55,17 @@ class MNN_PUBLIC FileLoader {
     std::string mFilePath;
     bool mInited = false;
 };
+
+class MemoryLoader : public BaseLoader {
+public:
+    MemoryLoader(unsigned char* ptr) : buffer_(ptr) {}
+    virtual bool read(char *dst, int64_t size) override {
+        ::memcpy(dst, buffer_, size);
+        buffer_ += size;
+        return true;
+    }
+private:
+    unsigned char* buffer_ = nullptr;
+};
 } // namespace MNN
 #endif
diff --git a/source/core/IDSTDecoder.hpp b/source/core/IDSTDecoder.hpp
index 679e92fcc..757fdbf4d 100644
--- a/source/core/IDSTDecoder.hpp
+++ b/source/core/IDSTDecoder.hpp
@@ -12,6 +12,7 @@
 #include <map>
 #include <cmath>
 #include "MNN_generated.h"
+#include "core/FileLoader.hpp"
 #include "core/ConvolutionCommon.hpp"
 
 using namespace MNN;
@@ -22,9 +23,9 @@ static inline void *MNNMemoryAllocAlignZeroAlign(size_t size) {
     return MNNMemoryCallocAlign(size, MNN_MEMORY_ALIGN_DEFAULT);
 }
 
-static int ReadBlobDim(unsigned char *&myfile, unsigned int* shape, int shapeBufCnt, bool useInt32) {
-    int uSize = myfile[0];
-    myfile++;
+static int ReadBlobDim(BaseLoader* myfile, unsigned int* shape, int shapeBufCnt, bool useInt32) {
+    uint8_t uSize = 0;
+    myfile->read((char*)&uSize, 1);
     if (uSize > 4) {
         printf("Read shape error!\n");
         return 0;
@@ -34,14 +35,13 @@ static int ReadBlobDim(unsigned char *&myfile, unsigned int* shape, int shapeBuf
         copyLength = shapeBufCnt;
     }
     if (useInt32) {
-        ::memcpy(shape, myfile, sizeof(unsigned int) * copyLength);
-        myfile += copyLength * sizeof(unsigned int);
+        myfile->read((char*)shape, sizeof(unsigned int) * copyLength);
     } else {
-        auto myfileint16 = (uint16_t*)myfile;
-        for (int i=0; i<copyLength; ++i) {
-            shape[i] = myfileint16[i];
+        uint16_t shape_i16[32] = {0};
+        myfile->read((char*)shape_i16, sizeof(uint16_t) * copyLength);
+        for (int i = 0; i < copyLength; ++i) {
+            shape[i] = shape_i16[i];
         }
-        myfile += copyLength * sizeof(unsigned short);
     }
     return copyLength;
 }
@@ -188,11 +188,6 @@ static int8_t FindInMap(PSIMPLE_MAP map, int8_t k, int *found) {
     return 0;
 }
 
-static void StreamSizeRead(void *dst, int unit, size_t count, unsigned char *&file) {
-    ::memcpy(dst, file, unit * count);
-    file += (unit * count);
-}
-
 static bool isLinearSample(const std::vector<int8_t>& sample, int bit) {
     const int offset = 1 << (bit - 1);
     const int size = 1 << bit;
@@ -207,16 +202,16 @@ static bool isLinearSample(const std::vector<int8_t>& sample, int bit) {
     return true;
 }
 
-static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon::Int8Common* result, bool shapeInt32) {
+static int8_t *ReadQuanData_c(BaseLoader* s, size_t* len, ConvolutionCommon::Int8Common* result, bool shapeInt32, bool forceQuant) {
     int8_t *blob      = nullptr;
     uint8_t *idxBuf   = nullptr;
     uint8_t *idxBytes = nullptr;
-    uint32_t dataCnt  = 1;
+    size_t dataCnt  = 1;
 
     do {
         // blob shape
         unsigned int shape[32] = {0};
-        uint32_t shapeDim        = (uint32_t)ReadBlobDim(s, shape, 32, shapeInt32);
+        uint32_t shapeDim = (uint32_t)ReadBlobDim(s, shape, 32, shapeInt32);
         if (shapeDim == 0 || shapeDim > 32)
             break;
         for (uint32_t i = 0; i < shapeDim; i++)
@@ -224,7 +219,7 @@ static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon:
 
         // sample
         uint32_t sampleCnt = 0;
-        StreamSizeRead(&sampleCnt, 1, 1, s);
+        s->read((char*)&sampleCnt, 1);
         if (sampleCnt == 0) {
             sampleCnt = 256;
         }
@@ -232,7 +227,7 @@ static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon:
         auto samples = result->weightMap.data();
         if (samples == nullptr)
             break;
-        StreamSizeRead(samples, 1, sampleCnt, s);
+        s->read((char*)samples, sampleCnt);
         SimpleRank(samples, sampleCnt, 1);
         uint32_t idxBitsCnt = atLestBitsCnt(sampleCnt);
         idxBitsCnt = idxBitsCnt < 1 ? 1 : idxBitsCnt;
@@ -243,18 +238,16 @@ static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon:
             MNN_ERROR("Not enought memory\n");
             break;
         }
-        StreamSizeRead(idxBuf, 1, idxBufSize, s);
+        s->read((char*)idxBuf, idxBufSize);
         if (idxBitsCnt == 4) {
             dataCnt = UP_DIV(dataCnt, 2) * 2;
         }
-        blob  = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)dataCnt);
-        if (nullptr == blob) {
-            break;
-        }
 
         if (isLinearSample(result->weightMap, idxBitsCnt) && (idxBitsCnt == 4 || idxBitsCnt == 8)) {
-            // fast sample for bit = 4 or 8
-            if (idxBitsCnt == 4) {
+            if (!forceQuant && idxBitsCnt == 4) {
+                // back to float, 4bit to 8bit
+                *len = dataCnt;
+                blob  = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)dataCnt);
                 for (int i = 0; i < idxBufSize; i++) {
                     int val = idxBuf[i];
                     int x1 = val / 16;
@@ -262,14 +255,24 @@ static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon:
                     blob[2 * i] = x1 - 8;
                     blob[2 * i + 1] = x2 - 8;
                 }
-            }
-            if (idxBitsCnt == 8) {
-                for (int i = 0; i < idxBufSize; i++) {
-                    int val = idxBuf[i];
-                    blob[i] = val - 128;
+            } else {
+                // keep quant
+                blob = (int8_t*)idxBuf;
+                idxBuf = nullptr;
+                if (idxBitsCnt == 4) {
+                    result->canUseInt4 = true;
+                } else {
+                    for (int i = 0; i < idxBufSize; i++) {
+                        blob[i] = (int)blob[i] - 128;
+                    }
                 }
+                *len = idxBufSize;
             }
         } else {
+            blob  = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)dataCnt);
+            if (nullptr == blob) {
+                break;
+            }
             // split index value into bytes
             idxBytes = (uint8_t *)MNNMemoryAllocAlignZeroAlign(dataCnt * sizeof(uint8_t));
             if (idxBitsCnt == 0 || nullptr == idxBytes) {
@@ -292,6 +295,8 @@ static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon:
             }
             MNNMemoryFreeAlign(idxBytes);
             idxBytes = nullptr;
+            if (len)
+                *len = blob ? dataCnt : 0;
         }
     } while (0);
 
@@ -299,12 +304,11 @@ static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon:
         MNNMemoryFreeAlign(idxBuf);
     if (idxBytes != nullptr)
         MNNMemoryFreeAlign(idxBytes);
-    if (len)
-        *len = blob ? dataCnt : 0;
+
     return blob;
 }
 
-static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const float* alpha_ptr, size_t alpha_size, ConvolutionCommon::Int8Common* result, bool useInt32) {    // MNN_ERROR("sparse:%d\n", 1);
+static int8_t *ReadSparseQuanData_c(BaseLoader* myfile, size_t* len, const float* alpha_ptr, size_t alpha_size, ConvolutionCommon::Int8Common* result, bool useInt32) {    // MNN_ERROR("sparse:%d\n", 1);
     unsigned int shape[32];
     uint32_t ucMapSize = 0;
     PSIMPLE_SET setWeight = CreateSimpleSet(256);
@@ -324,9 +328,9 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const f
     if (blob == nullptr)
         return nullptr;
     // 2. nnz
-    StreamSizeRead(&nnz, 4, 1, myfile);
+    myfile->read((char *)&nnz, 4);
     // 3. max_step use # bits () (unsigned char)
-    StreamSizeRead(&iIdxNeedBits, 1, 1, myfile);
+    myfile->read((char *)&iIdxNeedBits, 1);
     // read idx array
     // 4. buf for steps ceil(nnz*step need bits/8)
     AutoStorage<unsigned char> arrIdxBuffer(nnz);
@@ -340,12 +344,12 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const f
         if (nullptr == buf) {
             return nullptr;
         }
-        StreamSizeRead(buf, 1, bufLen, myfile);
+        myfile->read((char *)buf, bufLen);
         SplitBufToArray((uint8_t *)buf, (uint32_t)bufLen, (uint8_t *)arrIdx, (uint32_t)nnz, (uint32_t)iIdxNeedBits);
         MNNMemoryFreeAlign(buf);
     }
     // 5. Avalable values Count(unsigned char)
-    StreamSizeRead(&ucMapSize, 1, 1, myfile);
+    myfile->read((char *)&ucMapSize, 1);
     if (0 == ucMapSize) {
         ucMapSize = 256;
     }
@@ -353,7 +357,7 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const f
     // 6. valueset(signed char * valueset_size)
     for (int i = 0; i < ucMapSize; i++) {
         int8_t tmp;
-        StreamSizeRead(&tmp, 1, 1, myfile);
+        myfile->read((char *)&tmp, 1);
         InsertSimpleSet(setWeight, tmp);
         result->weightMap[i] = tmp;
     }
@@ -383,7 +387,7 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const f
         if (nullptr == buf) {
             return nullptr;
         }
-        StreamSizeRead(buf, 1, bufLen, myfile);
+        myfile->read((char *)buf, bufLen);
         SplitBufToArray((uint8_t *)buf, (uint32_t)bufLen, (uint8_t *)arrWeightIdx, (uint32_t)nnz,
                         (uint32_t)iDataNeedBits);
         MNNMemoryFreeAlign(buf);
diff --git a/source/core/Interpreter.cpp b/source/core/Interpreter.cpp
index 127bd6e52..6620e0045 100644
--- a/source/core/Interpreter.cpp
+++ b/source/core/Interpreter.cpp
@@ -41,7 +41,7 @@ struct Content {
     std::string uuid;
     std::string externalFile;
 #ifdef MNN_INTERNAL_ENABLED
-    std::map<std::string, std::string> basicLogginData;
+    std::string version;
     std::map<const Session*, std::tuple<int, int>> sessionInfo;
 #endif
 };
@@ -221,8 +221,7 @@ Interpreter::Interpreter(Content* net) {
     mNet->bizCode = std::string(mNet->net->bizCode() ? mNet->net->bizCode()->c_str() : "");
     mNet->uuid = std::string(mNet->net->mnn_uuid() ? mNet->net->mnn_uuid()->c_str() : "");
 #ifdef MNN_INTERNAL_ENABLED
-    mNet->basicLogginData = logBasicInfo();
-    mNet->basicLogginData.emplace("ModelVersion", getModelVersion());
+    mNet->version = getModelVersion();
 #endif
 }
 
@@ -329,7 +328,8 @@ Session* Interpreter::createMultiPathSession(const std::vector<ScheduleConfig>&
     int mode = configs[0].mode;
     mNet->sessionInfo.insert(std::make_pair(result, std::make_tuple(precision, mode)));
     if (shouldLog(FREQ_HIGH)) {
-        std::map<std::string, std::string> metrics = mNet->basicLogginData;
+        std::map<std::string, std::string> metrics = logBasicInfo();
+        metrics.emplace("ModelVersion", mNet->version);
         metrics.emplace("UUID", mNet->uuid);
         metrics.emplace("Time", std::to_string((float)_timer.durationInUs() / 1024.0f));
         metrics.emplace("Backend", std::to_string(configs[0].type));
@@ -383,7 +383,8 @@ void Interpreter::logForRunSession(const Session* session, float timeInMs, const
     session->getInfo(MNN::Interpreter::FLOPS, &flops);
     float memory = 0.0f;
     session->getInfo(MNN::Interpreter::MEMORY, &memory);
-    std::map<std::string, std::string> metrics = mNet->basicLogginData;
+    std::map<std::string, std::string> metrics = logBasicInfo();
+    metrics.emplace("ModelVersion", mNet->version);
     metrics.emplace("UUID", mNet->uuid);
     metrics.emplace("Backend", std::to_string(backendType[0])); // "Precision" is not logged here. Don't need it.
     metrics.emplace("Time", std::to_string(timeInMs));
diff --git a/source/core/MNNFileUtils.cpp b/source/core/MNNFileUtils.cpp
new file mode 100644
index 000000000..445ec2bb3
--- /dev/null
+++ b/source/core/MNNFileUtils.cpp
@@ -0,0 +1,284 @@
+//
+//  MNNFileUtils.cpp
+//  MNN
+//
+//  Created by MNN on 2024/07/25.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#include "MNNFileUtils.h"
+
+std::string MNNFilePathConcat(std::string prefix, std::string suffix) {
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    return prefix + "\\" + suffix;
+#else
+    return prefix + "/" + suffix;
+#endif
+}
+
+bool MNNDirExist(const char * path) {
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    DWORD file_attributes = GetFileAttributes(path);
+    return (file_attributes != INVALID_FILE_ATTRIBUTES) && (file_attributes & FILE_ATTRIBUTE_DIRECTORY);
+#else
+    struct stat info;
+    return (stat(path, &info) == 0) && (info.st_mode & S_IFDIR);
+#endif
+}
+
+bool MNNFileExist(const char * file_name)
+{
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    return _access(file_name, 0) == 0;
+#else
+    return access(file_name, F_OK) == 0;
+#endif
+}
+
+file_t MNNCreateFile(const char * file_name)
+{
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    HANDLE hd = CreateFile(
+        file_name,                      // File Name
+        GENERIC_READ | GENERIC_WRITE,   // Read and Write
+        0,                              // No Sharing
+        NULL,                           // No Security
+        CREATE_ALWAYS,                  // Create the file and cover the existing file
+        FILE_ATTRIBUTE_NORMAL,          // Normal Attribute
+        NULL                            // No Template
+    );
+    if (hd == INVALID_HANDLE_VALUE) {
+        printf("Failed to create the file: %s\n", file_name);
+        return INVALID_FILE;
+    }
+    return hd;
+#else
+    int fd = open(
+        file_name,                      // File Name
+        O_RDWR | O_CREAT | O_TRUNC,     // Read and Write and Create the file and cover existing file
+        0666                            // Read and Write Permission for Everyone
+    );
+    if (fd == -1) {
+        printf("Failed to create the file: %s\n", file_name);
+        return INVALID_FILE;
+    }
+    return fd;
+#endif
+}
+
+file_t MNNOpenFile(const char * file_name, uint32_t flags)
+{
+    if (!MNNFileExist(file_name)) {
+        return INVALID_FILE;
+    }
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    DWORD mode = 0;
+    if (flags & MNN_FILE_READ) {
+        mode |= GENERIC_READ;
+    }
+    if (flags & MNN_FILE_WRITE) {
+        mode |= GENERIC_WRITE;
+    }
+    HANDLE hd = CreateFile(
+        file_name,              // File Name
+        mode,                   // Opening Mode
+        0,                      // No Sharing
+        NULL,                   // No Security
+        OPEN_EXISTING,          // Only Open Existing File
+        FILE_ATTRIBUTE_NORMAL,  // Normal Attribute
+        NULL                    // No Template
+    );
+    if (hd == INVALID_HANDLE_VALUE) {
+        printf("Failed to open the file: %s\n", file_name);
+        return INVALID_FILE;
+    }
+    return hd;
+#else
+    int mode = 0;
+    if (flags & MNN_FILE_READ) {
+        mode = O_RDONLY;
+    }
+    if (flags & MNN_FILE_WRITE) {
+        mode = O_RDWR;
+    }
+    int fd = open(file_name, mode);
+    if (fd == -1) {
+        printf("Failed to open the file: %s\n", file_name);
+        return INVALID_FILE;
+    }
+    return fd;
+#endif
+}
+
+ErrorCode MNNCloseFile(file_t file)
+{
+    if (file == INVALID_FILE) {
+        return FILE_NOT_EXIST;
+    }
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    if (!CloseHandle(file)) {
+        return FILE_CLOSE_FAILED;
+    }
+#else
+    if (-1 == close(file)) {
+        return FILE_CLOSE_FAILED;
+    }
+#endif
+    return NO_ERROR;
+}
+
+ErrorCode MNNRemoveFile(const char * file_name)
+{
+    if (!MNNFileExist(file_name)) {
+        return FILE_NOT_EXIST;
+    }
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    if (!DeleteFile(file_name)) {
+        return FILE_REMOVE_FAILED;
+    }
+#else
+    if (-1 == unlink(file_name)) {
+        return FILE_REMOVE_FAILED;
+    }
+#endif
+    return NO_ERROR;
+}
+
+size_t MNNGetFileSize(file_t file)
+{
+    if (file == INVALID_FILE) {
+        return INVALID_SIZE;
+    }
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    LARGE_INTEGER fileSize;
+    if (!GetFileSizeEx(file, &fileSize)) {
+        return (size_t)(-1);
+    } else {
+        return (size_t)(fileSize.QuadPart);
+    }
+#else
+    struct stat file_stat;
+    if (fstat(file, &file_stat) == -1) {
+        return (size_t)(-1);
+    } else {
+        return file_stat.st_size;
+    }
+#endif
+}
+
+ErrorCode MNNSetFileSize(file_t file, size_t aimed_size)
+{
+    if (file == INVALID_FILE) {
+        return FILE_NOT_EXIST;
+    }
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    LARGE_INTEGER size;
+    size.QuadPart = aimed_size;
+    bool success = SetFilePointerEx(file, size, NULL, FILE_BEGIN);
+    if (!success) {
+        return FILE_RESIZE_FAILED;
+    }
+    success = SetEndOfFile(file);
+    if (!success) {
+        return FILE_RESIZE_FAILED;
+    }
+    return NO_ERROR;
+#else
+    if (-1 == ftruncate(file, aimed_size)) {
+        return FILE_RESIZE_FAILED;
+    }
+    return NO_ERROR;
+#endif
+}
+
+size_t MNNReadFile(file_t file, void * buf, size_t bytes)
+{
+    if (file == INVALID_FILE || buf == nullptr) {
+        return 0;
+    }
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    DWORD readbytes = 0;
+    if (ReadFile(file, buf, bytes, &readbytes, NULL)) {
+        return readbytes;
+    } else {
+        return 0;
+    }
+#else
+    return read(file, buf, bytes);
+#endif
+}
+
+size_t MNNWriteFile(file_t file, void * buf, size_t bytes)
+{
+    if (file == INVALID_FILE || buf == nullptr) {
+        return 0;
+    }
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    DWORD writebytes = 0;
+    if (WriteFile(file, buf, bytes, &writebytes, NULL)) {
+        return writebytes;
+    } else {
+        return 0;
+    }
+#else
+    return write(file, buf, bytes);
+#endif
+}
+
+ErrorCode MNNSetFilePointer(file_t file, size_t offset)
+{
+    if (file == INVALID_FILE) {
+        return FILE_NOT_EXIST;
+    }
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    LARGE_INTEGER liDistanceToMove;
+    liDistanceToMove.QuadPart = offset;
+    if (SetFilePointerEx(file, liDistanceToMove, NULL, FILE_BEGIN)) {
+        return NO_ERROR;
+    } else {
+        return FILE_SEEK_FAILED;
+    }
+#else
+    if (-1 == lseek(file, offset, SEEK_SET)) {
+        return FILE_SEEK_FAILED;
+    } else {
+        return NO_ERROR;
+    }
+#endif
+}
+
+void * MNNMmapFile(file_t file, size_t size)
+{
+    if (file == INVALID_FILE || MNNGetFileSize(file) < size) {
+        return nullptr;
+    }
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    HANDLE hFileMapping = CreateFileMapping(file, NULL, PAGE_READWRITE, (size >> 32) & 0xffffffff, size & 0xffffffff, NULL);
+    if (hFileMapping == NULL) {
+        return nullptr;
+    }
+    void * addr = MapViewOfFile(hFileMapping, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, size);
+    CloseHandle(hFileMapping);
+    return addr;
+#else
+    void * addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, file, 0);
+    if (addr == MAP_FAILED) {
+        return nullptr;
+    }
+    return addr;
+#endif
+}
+
+ErrorCode MNNUnmapFile(void * addr, size_t size)
+{
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    if (!UnmapViewOfFile(addr)) {
+        return FILE_UNMAP_FAILED;
+    }
+#else
+    if (-1 == munmap(addr, size)) {
+        return FILE_UNMAP_FAILED;
+    }
+#endif
+    return NO_ERROR;
+}
\ No newline at end of file
diff --git a/source/core/MNNFileUtils.h b/source/core/MNNFileUtils.h
new file mode 100644
index 000000000..a3ecb4be8
--- /dev/null
+++ b/source/core/MNNFileUtils.h
@@ -0,0 +1,182 @@
+//
+//  MNNFileUtils.h
+//  MNN
+//
+//  Created by MNN on 2024/07/25.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifndef MNN_FileUtils_H
+#define MNN_FileUtils_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string>
+#include "core/Macro.h"
+#include "MNN/ErrorCode.hpp"
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+#include <windows.h>
+#include <io.h>
+#undef max
+#undef min
+#undef NO_ERROR
+#else
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/fcntl.h>
+#include <sys/mman.h>
+#endif
+
+using namespace MNN;
+
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+    typedef HANDLE file_t;
+    const file_t INVALID_FILE = INVALID_HANDLE_VALUE;
+#else
+    typedef int file_t;
+    const file_t INVALID_FILE = -1;
+#endif
+
+#define MNN_FILE_READ   1U
+#define MNN_FILE_WRITE  2U
+#define INVALID_SIZE    ((size_t)(-1))
+
+/*=============================================================================================
+**  @brief      Concat a file name with a directory path
+**  @hint       This function can be called multiple times to concat multi-level paths
+*/
+MNN_PUBLIC std::string MNNFilePathConcat(std::string prefix, std::string suffix);
+
+/*=============================================================================================
+**  @brief      Check whether a directory exists
+**  @param      path -- path of the directory
+**  @return     If the directory exists, returns true
+**              If the directory does not exist, return false
+*/
+MNN_PUBLIC bool MNNDirExist(const char * path);
+
+/*=============================================================================================
+**  @brief      Check whether a file exists
+**  @param      file_name -- path of the file
+**  @return     If the file exists, returns true
+**              If the file does not exist, return false
+*/
+MNN_PUBLIC bool MNNFileExist(const char * file_name);
+
+/*=============================================================================================
+**  @brief      Create a file
+**  @param      file_name -- path of the file
+**  @return     If succeeded, returns the handle of the created file in the read and write mode
+**              If failed, returns INVALID_FILE
+**  @warning    If the file exists already, it will be covered
+**              Size of the newly created file will be 0
+*/
+MNN_PUBLIC file_t MNNCreateFile(const char * file_name);
+
+/*=============================================================================================
+**  @brief      Open a file
+**  @param      file_name -- path of the file
+**              flags -- openning mode (MNN_FILE_READ or MNN_FILE_WRITE or both)
+**  @return     If succeeded, returns the handle of the file
+**              If failed, returns INVALID_FILE
+**  @warning    If the file does not exist, this function would fail
+**              Make sure that the aimed file has been created by MNNCreateFile()
+*/
+MNN_PUBLIC file_t MNNOpenFile(const char * file_name, uint32_t flags);
+
+/*=============================================================================================
+**  @brief      Close a file
+**  @param      file -- handle of the file
+**  @return     If succeeded, returns NO_ERROR
+**              If failed, returns FAILED
+**  @warning    Close an INVALID_FILE would fail
+**              Make sure that the aimed file has been opened by MNNOpenFile()
+*/
+MNN_PUBLIC ErrorCode MNNCloseFile(file_t file);
+
+/*=============================================================================================
+**  @brief      Remove a file
+**  @param      file_name -- path of the file
+**  @return     If succeeded, returns NO_ERROR
+**              If failed, returns FAILED
+**  @warning    If the file does not exist, this function would fail
+*/
+MNN_PUBLIC ErrorCode MNNRemoveFile(const char * file_name);
+
+/*=============================================================================================
+**  @brief      Get the size of a file
+**  @param      file -- handle of the file
+**  @return     size of the file or INVALID_SIZE for INVALID_FILE
+*/
+MNN_PUBLIC size_t MNNGetFileSize(file_t file);
+
+/*=============================================================================================
+**  @brief      Resize a file
+**  @param      file -- handle of the file
+**              aimed_size -- the aimed size of this file
+**  @return     If succeeded, returns NO_ERROR
+**              If failed, returns FAILED
+**  @warning    resize an INVALID_FILE would fail
+*/
+MNN_PUBLIC ErrorCode MNNSetFileSize(file_t file, size_t aimed_size);
+
+/*=============================================================================================
+**  @brief      Read from the file to the buffer
+**  @param      file  -- handle of the file
+**              buf   -- start address of the buffer in memory
+**              bytes -- number of bytes to be read
+**  @return     how many bytes have been read actually
+**  @warning    Make sure that space of the buffer is enough
+**              Otherwise, this function may access out of bounds
+*/
+MNN_PUBLIC size_t MNNReadFile(file_t file, void * buf, size_t bytes);
+
+/*=============================================================================================
+**  @brief      Write to the file from the buffer
+**  @param      file  -- handle of the file
+**              buf   -- start address of the buffer in memory
+**              bytes -- number of bytes to be written
+**  @return     how many bytes have been written actually
+**  @warning    Make sure the data in the buffer is enough
+**              Otherwise, this function may access out of bounds
+*/
+MNN_PUBLIC size_t MNNWriteFile(file_t file, void * buf, size_t bytes);
+
+/*=============================================================================================
+**  @brief      Set the file pointer to a given position
+**  @param      file   -- handle of the file
+**              offset -- the aimed postion from the start of the file
+**  @return     If succeeded, returns NO_ERROR
+**              If failed, returns FAILED
+**  @warning    Make sure the offset not exceeding the file size
+*/
+MNN_PUBLIC ErrorCode MNNSetFilePointer(file_t file, size_t offset);
+
+/*=============================================================================================
+**  @brief      Memory-map the file to the virtual address space of the current process
+**  @param      file -- handle of the file
+**              size -- mapped length
+**  @return     If succeeded, returns the start address of the mapped space
+**              If failed, return nullptr
+**  @hint       Memory-mapping a file to the virtual address space enables the process to access it by pointers
+**              After the memory-mapping, the user can simply treat the mapped space as a memory buffer 
+**              Read from or write to the mapped space will trigger data swapping
+**              between the file on disk and the kernel page cache in memory
+**              which is managed by the OS kernel and is transparent to the user
+**  @warning    Make sure that the mapped size is no larger than the size of the file
+**              Especially when mapping a newly created file, whose size is 0
+*/
+MNN_PUBLIC void * MNNMmapFile(file_t file, size_t size);
+
+/*=============================================================================================
+**  @brief      Unmap a previously mapped memory space
+**  @param      addr -- start address of the mapped space
+**              size -- mapped length
+**  @return     If succeeded, returns NO_ERROR
+**              If failed, returns FAILED
+**  @warning    Make sure that this space was mapped by the MNNMmapFile() before
+**              and the size is correct
+*/
+MNN_PUBLIC ErrorCode MNNUnmapFile(void * addr, size_t size);
+
+#endif // MNN_FileUtils_H
\ No newline at end of file
diff --git a/source/core/OpCommonUtils.cpp b/source/core/OpCommonUtils.cpp
index f5e385605..8c5596312 100644
--- a/source/core/OpCommonUtils.cpp
+++ b/source/core/OpCommonUtils.cpp
@@ -619,8 +619,8 @@ static bool _RebuildExternalOp(FileLoader* external, const MNN::Op* origin, flat
         {
             auto layer_norm_param = op->main.AsLayerNorm();
             int32_t size = static_cast<int32_t>(layer_norm_param->external[1]);
-            layer_norm_param->gamma.resize(size);
-            layer_norm_param->beta.resize(size);
+            layer_norm_param->gamma.resize(size / sizeof(float));
+            layer_norm_param->beta.resize(size / sizeof(float));
             external->offset(layer_norm_param->external[0]);
             external->read((char*)layer_norm_param->gamma.data(), layer_norm_param->external[1]);
             external->read((char*)layer_norm_param->beta.data(), layer_norm_param->external[2]);
@@ -631,13 +631,21 @@ static bool _RebuildExternalOp(FileLoader* external, const MNN::Op* origin, flat
         {
             auto param = op->main.AsConvolution2D();
             if (param->quanParameter) {
-                external->offset(param->external[0]);
-                if (0 != param->external[1]) {
-                    param->quanParameter->buffer.resize(param->external[1]);
-                    external->read((char*)param->quanParameter->buffer.data(), param->external[1]);
+                bool isSparse = param->sparseParameter.get() != nullptr;
+                bool isPTQ = param->quanParameter->scaleIn != 0;
+                if (isSparse || isPTQ) {
+                    external->offset(param->external[0]);
+                    if (0 != param->external[1]) {
+                        param->quanParameter->buffer.resize(param->external[1]);
+                        external->read((char*)param->quanParameter->buffer.data(), param->external[1]);
+                    }
+                    param->quanParameter->alpha.resize(param->external[2] / sizeof(float));
+                    external->read((char*)param->quanParameter->alpha.data(), param->external[2]);
+                } else {
+                    // skip weight and dequant alpha for load speed
+                    op->externalPath = external->path();
+                    external->offset(param->external[0] + param->external[1] + param->external[2]);
                 }
-                param->quanParameter->alpha.resize(param->external[2] / sizeof(float));
-                external->read((char*)param->quanParameter->alpha.data(), param->external[2]);
                 if (param->bias.empty() && param->external.size() > 3) {
                     param->bias.resize(param->external[3]/sizeof(float));
                     external->read((char*)param->bias.data(), param->external[3]);
diff --git a/source/core/Pipeline.cpp b/source/core/Pipeline.cpp
index 2798239e5..553b964e2 100644
--- a/source/core/Pipeline.cpp
+++ b/source/core/Pipeline.cpp
@@ -270,6 +270,7 @@ ErrorCode Pipeline::encode(bool supportDebug, bool permitCodegen) {
     } else {
 #ifndef MNN_BUILD_MINI
         mContext.clear();
+        mContext.mNeedRelease = mGeometryNeedRelease;
         FileLoader l(mExternalFile.c_str());
         /** Size Compute and compute Const Begin */
         auto res = GeometryComputerUtils::shapeComputeAndGeometryTransform(&l, mInfo.second, mContext, mInfo.first.cache.second, mUseGeometry, false, permitCodegen);
@@ -877,12 +878,17 @@ void Pipeline::_recycleDynamicMemory(Command* command) {
     }
 }
 void Pipeline::openResizeCheck() {
+#ifndef MNN_BUILD_MINI
+    mGeometryNeedRelease = false;
     for (auto& info : mInfo.second) {
         info.computeCache.open();
     }
+#endif
 }
 
 ErrorCode Pipeline::fixResizeCache() {
+#ifndef MNN_BUILD_MINI
+    // TODO: Recompute release mask and set mGeometryNeedRelease = true
     for (auto& info : mInfo.second) {
         if (info.type == Schedule::CONSTANT && (!info.computeCache.needExecuteConst)) {
             info.executeBuffer.command.clear();
@@ -895,6 +901,7 @@ ErrorCode Pipeline::fixResizeCache() {
     res = res && mInfo.first.cache.second->onSelectDynamicAllocator(1, 2);
     if (!res) {
         MNN_PRINT("%d backend don't support resize fix optimize\n", mInfo.first.cache.first->type());
+        mGeometryNeedRelease = true;
         return NOT_SUPPORT;
     }
     size_t totalNumber = 0;
@@ -946,6 +953,7 @@ ErrorCode Pipeline::fixResizeCache() {
     mInfo.first.cache.first->onSelectDynamicAllocator(0, 2);
     res && mInfo.first.cache.second->onSelectDynamicAllocator(0, 2);
     MNN_PRINT("Fix: %d - Total: %d, rate = %f\n", fixNumber, totalNumber, (float)fixNumber / (float)totalNumber);
+#endif
     return NO_ERROR;
 }
 ErrorCode Pipeline::_allocForTensor(int index, bool allocInput) {
@@ -1070,28 +1078,6 @@ ErrorCode Pipeline::_allocForTensor(int index, bool allocInput) {
 ErrorCode Pipeline::allocMemory(bool firstMalloc, bool forbidReplace) {
     // MNN_PRINT("allocMemory mtype:%d, cpubackendType:%d, cpuBackend runtime:%p\n", mBackend->type(), mBackupBackend->type(), mBackupBackend->getRuntime());
     if (!firstMalloc) {
-        // For session setNeedMalloc, if session's output is set as some input, It may cause error
-        // Dup des to avoid it
-        for (auto& info : mInfo.second) {
-            auto& buffer = info.executeBuffer;
-            for (const auto& infoP : buffer.command) {
-                auto& info = *infoP;
-                for (auto t : info.workOutputs) {
-                    if (!TensorUtils::getDescribe(t)->isMutable) {
-                        continue;
-                    }
-                    auto des = TensorUtils::getDescribe(t);
-                    auto usage = des->usage;
-                    if (TensorUtils::getDescribeOrigin(t)->mContent.use_count() > 1 && usage != Tensor::InsideDescribe::CONSTANT) {
-                        TensorUtils::getDescribeOrigin(t)->mem = nullptr;
-                        auto res = TensorUtils::getDescribeOrigin(t)->getBackend()->onAcquireBuffer(t, Backend::STATIC);
-                        if (!res) {
-                            return OUT_OF_MEMORY;
-                        }
-                    }
-                }
-            }
-        }
         if (OpCommonUtils::supportDynamicInputMemory(mInfo.first.cache.first->type()) && (!mInfo.first.inputBackendChange)) {
             return NO_ERROR;
         }
diff --git a/source/core/Pipeline.hpp b/source/core/Pipeline.hpp
index 6fb9543d3..c3611fe59 100644
--- a/source/core/Pipeline.hpp
+++ b/source/core/Pipeline.hpp
@@ -81,6 +81,7 @@ class Pipeline : public NonCopyable {
 #ifndef MNN_BUILD_MINI
     GeometryComputer::Context mContext;
     Runtime::CompilerType mUseGeometry;
+    bool mGeometryNeedRelease = true;
 #endif
     const Runtime* mRuntime;
     const Runtime* mCpuRuntime;
diff --git a/source/core/Schedule.cpp b/source/core/Schedule.cpp
index 475708567..63065596d 100644
--- a/source/core/Schedule.cpp
+++ b/source/core/Schedule.cpp
@@ -142,9 +142,13 @@ MNNForwardType Schedule::getApprociateType(const ScheduleConfig& config) {
                 Backend::Info info;
                 info.type = type;
                 std::shared_ptr<Runtime> bn(creator->onCreate(info));
-                bool isSupportLowPower = bn->onGetRuntimeStatus(RuntimeStatus::STATUS_SUPPORT_POWER_LOW);
-                if(!isSupportLowPower) {
-                    MNN_PRINT("type=%d backend don't Support Low Power, use %d instead\n", type, config.backupType);
+                if (nullptr != bn.get()) {
+                    bool isSupportLowPower = bn->onGetRuntimeStatus(RuntimeStatus::STATUS_SUPPORT_POWER_LOW);
+                    if(!isSupportLowPower) {
+                        MNN_PRINT("type=%d backend don't Support Low Power, use %d instead\n", type, config.backupType);
+                        type = config.backupType;
+                    }
+                } else{
                     type = config.backupType;
                 }
             }
diff --git a/source/core/Schedule.hpp b/source/core/Schedule.hpp
index a13b213f5..1065cfd82 100644
--- a/source/core/Schedule.hpp
+++ b/source/core/Schedule.hpp
@@ -78,6 +78,9 @@ class MNN_PUBLIC Schedule {
         
         std::map<const Op*, std::shared_ptr<Execution>> executionCache;
         OpResizeCache computeCache;
+        
+        /** For CONSTANT info, can release indexes after resize*/
+        std::vector<int> releaseAbleInputs;
     };
 
     // Backend, Tensor, shape-dirty, content-dirty
diff --git a/source/core/Session.cpp b/source/core/Session.cpp
index 9b27d5e1f..a424898ba 100644
--- a/source/core/Session.cpp
+++ b/source/core/Session.cpp
@@ -87,10 +87,24 @@ void Session::ModeGroup::setHint(Interpreter::HintMode mode, int hint) {
         case Interpreter::KVCACHE_QUANT_OPTIONS:
             runtimeHint.kvcacheQuantOption = hint;
             break;
+        case Interpreter::KVCACHE_SIZE_LIMIT:
+            runtimeHint.kvcacheSizeLimit = hint;
+            break;
         default:
             break;
     }
 }
+
+void Session::ModeGroup::setExternalPath(std::string path, int type) {
+    switch (type) {
+        case MNN::Interpreter::EXTERNAL_PATH_KVCACHE_DIR:
+            runtimeHint.kvcacheDirPath = path;
+            break;
+        default:
+            break;
+    }
+}
+
 Session::Session(Schedule::ScheduleInfo&& info, const ModeGroup& mode, RuntimeInfo&& runtime) {
     mMode = mode;
     mRuntime = std::move(runtime);
@@ -251,18 +265,10 @@ ErrorCode Session::resize() {
             }
         }
         if(mMemoryUsageMode == Interpreter::Session_Memory_Collect) {
-            #ifdef LOG_VERBOSE
-            float memory = 0.0f;
-            #endif
+            mRuntime.second->onGabageCollect(0);
             for (auto& iter : mRuntime.first) {
                 iter.second->onGabageCollect(0);
-                #ifdef LOG_VERBOSE
-                memory += iter.second->onGetMemoryInMB();
-                #endif
             }
-            #ifdef LOG_VERBOSE
-            FUNC_PRINT_ALL(memory, f);
-            #endif
         }
         mNeedMalloc = false;
         mNeedResize = false;
@@ -428,13 +434,14 @@ ErrorCode Session::updateToModel(Net* net) const {
 
 static void initTensors(std::vector<std::shared_ptr<Tensor>>& tensors, const std::vector<std::shared_ptr<Tensor>>& tensorSrc) {
     for (int i=0; i<tensors.size(); ++i) {
+        if (tensorSrc[i].get() == nullptr) {
+            continue;
+        }
         // Init all tensor except for const
         if (tensors[i].get() == nullptr) {
             tensors[i].reset(new Tensor);
             TensorUtils::getDescribe(tensors[i].get())->index = i;
         }
-    }
-    for (int i = 0; i < tensors.size(); ++i) {
         auto srcDes = TensorUtils::getDescribe(tensorSrc[i].get());
         if (srcDes->quantAttr != nullptr) {
             TensorUtils::getDescribe(tensors[i].get())->quantAttr.reset(new QuantAttr);
diff --git a/source/core/Session.hpp b/source/core/Session.hpp
index c753a6c51..8f7415ebd 100644
--- a/source/core/Session.hpp
+++ b/source/core/Session.hpp
@@ -39,6 +39,7 @@ class MNN_PUBLIC Session {
         RuntimeHint runtimeHint;
         void setHint(Interpreter::HintMode hint, int magic);
         void setMode(Interpreter::SessionMode mode);
+        void setExternalPath(std::string path, int type);
     };
     Session(Schedule::ScheduleInfo&& info, const ModeGroup& mode,
             RuntimeInfo&& runtime);
diff --git a/source/geometry/GeometryComputer.hpp b/source/geometry/GeometryComputer.hpp
index f826e4d99..f85a49a6f 100644
--- a/source/geometry/GeometryComputer.hpp
+++ b/source/geometry/GeometryComputer.hpp
@@ -45,6 +45,7 @@ class GeometryComputer {
             return mMask & option;
         }
         std::shared_ptr<BufferStorage> mRasterOp;
+        bool mNeedRelease = true;
     private:
         void getRasterCacheCreate(Tensor* src, CommandBuffer& cmd);
         std::map<const Op*, std::vector<std::shared_ptr<Tensor>>> mConstTensors;
diff --git a/source/geometry/GeometryComputerUtils.cpp b/source/geometry/GeometryComputerUtils.cpp
index fc76622ab..207d29e5d 100644
--- a/source/geometry/GeometryComputerUtils.cpp
+++ b/source/geometry/GeometryComputerUtils.cpp
@@ -150,6 +150,7 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform(
     bool openCache = geoContext.support(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_OPENCACHE);
     /** Size Compute and compute Const Begin */
     GeometryComputer::Context ctx(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_ALL, backupBackend);
+    bool needRelease = geoContext.mNeedRelease;
     // Size Compute and compute Const
     for (int i=0; i<infos.size(); ++i) {
         auto& info = infos[i];
@@ -330,6 +331,15 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform(
                 }
             }
         }
+        if (needRelease) {
+            cmdBufferVir.command.clear();
+            cmdBufferVir.extras.clear();
+            
+            ctx.clear();
+            for (auto index : info.releaseAbleInputs) {
+                TensorUtils::getDescribeOrigin(info.inputs[index])->mem = nullptr;
+            }
+        }
     }
 
     /** Size Compute and compute Const End */
diff --git a/source/geometry/GeometryConv2DBackPropFilter.cpp b/source/geometry/GeometryConv2DBackPropFilter.cpp
index f542da4ea..3277442d7 100644
--- a/source/geometry/GeometryConv2DBackPropFilter.cpp
+++ b/source/geometry/GeometryConv2DBackPropFilter.cpp
@@ -76,7 +76,7 @@ class GeometryConv2DBackPropFilter : public GeometryComputer {
                     endDx = endDx - (endSx - iw + sw) / sw;
                     endSx = endDx * sw + kx * dw - pads.first;
                 }
-                if (startDy > endDy) {
+                if (startDy > endDy || startDx > endDx) {
                     continue;
                 }
                 auto dstOffsetKx = dstOffsetKy + startDx;
diff --git a/source/geometry/GeometryReverseSequence.cpp b/source/geometry/GeometryReverseSequence.cpp
index f63a6e7e9..addcc0339 100644
--- a/source/geometry/GeometryReverseSequence.cpp
+++ b/source/geometry/GeometryReverseSequence.cpp
@@ -26,7 +26,7 @@ class GeometryReverseSequence : public GeometryComputer {
             MNN_ERROR("Dont's has Parameters for OpType_ReverseSequence\n");
             return false;
         }
-        auto seqDim = op->main_as_ReverseSequenceParam()->seqDim();
+        auto seqDim = op->main_as_ReverseSequenceParam()->seqDim(); // time_axis for ONNX
         if (seqDim < 0) {
             seqDim += inputs[0]->dimensions();
         }
@@ -82,7 +82,7 @@ class GeometryReverseSequence : public GeometryComputer {
         outputDes->regions.clear();
 
         for (int batch = 0; batch < batchSize; ++batch) {
-            auto q = reverse->host<int32_t>()[batch];
+            int q = reverse->host<int32_t>()[batch];
             if (q > input->length(seqDim) || q < 1) {
                 MNN_ERROR("ReverseSequence info error\n");
                 return false;
diff --git a/source/shape/SizeComputer.cpp b/source/shape/SizeComputer.cpp
index 3783aba65..61f449fcf 100644
--- a/source/shape/SizeComputer.cpp
+++ b/source/shape/SizeComputer.cpp
@@ -208,7 +208,7 @@ std::vector<int> SizeComputer::needInputContent(const MNN::Op* op, int inputSize
                 return std::vector<int>{ inputSize - 1 };
             }
         }
-        if (inputSize > 1 && (op->type() == OpType_Squeeze || op->type() == OpType_Unsqueeze)) {
+        if (inputSize > 1 && (op->type() == OpType_Squeeze || op->type() == OpType_Unsqueeze || op->type() == OpType_ReverseSequence || op->type() == OpType_Reverse)) {
             return std::vector<int>{1};
         }
         if (op->type() == OpType_CumSum) {
diff --git a/source/utils/InitNet.cpp b/source/utils/InitNet.cpp
index 534812f96..2c196851c 100644
--- a/source/utils/InitNet.cpp
+++ b/source/utils/InitNet.cpp
@@ -111,33 +111,49 @@ bool initConstTensors(std::vector<std::shared_ptr<Tensor>>& tensors, const Net*
     return valid;
 }
 
-bool initTensors(std::vector<std::shared_ptr<Tensor>>& tensors, const Net* net) {
+static void _createTensor(std::shared_ptr<Tensor>& dst, int index) {
+    if (dst.get() == nullptr) {
+        dst.reset(new Tensor);
+        TensorUtils::getDescribe(dst.get())->index = index;
+    }
+}
+bool initTensors(std::vector<std::shared_ptr<Tensor>>& tensors, const Net* net, const int* oplists, size_t opListSize) {
     bool valid    = true;
     auto describes = net->extraTensorDescribe();
-    std::vector<const TensorDescribe*> des(tensors.size());
-    for (int i=0; i<tensors.size(); ++i) {
-        // Init all tensor except for const
-        if (tensors[i].get() == nullptr) {
-            tensors[i].reset(new Tensor);
-            TensorUtils::getDescribe(tensors[i].get())->index = i;
-            // MNN_PRINT("initTensors create tensor:%p, index:%d, backend:%d\n", tensors[i].get(), i, TensorUtils::getDescribe(tensors[i].get())->backend);
+    if (nullptr != oplists) {
+        for (int i=0; i<opListSize; ++i) {
+            auto op = net->oplists()->GetAs<Op>(oplists[i]);
+            if (nullptr != op->inputIndexes()) {
+                for (int v=0; v<op->inputIndexes()->size(); ++v) {
+                    auto index = op->inputIndexes()->data()[v];
+                    _createTensor(tensors[index], index);
+                }
+            }
+            if (nullptr != op->outputIndexes()) {
+                for (int v=0; v<op->outputIndexes()->size(); ++v) {
+                    auto index = op->outputIndexes()->data()[v];
+                    _createTensor(tensors[index], index);
+                }
+            }
+        }
+    } else {
+        for (int i=0; i<tensors.size(); ++i) {
+            // Init all tensor except for const
+            _createTensor(tensors[i], i);
         }
     }
     if (describes) {
         for (int i = 0; i < describes->size(); i++) {
-            int index  = describes->GetAs<TensorDescribe>(i)->index();
-            des[index] = describes->GetAs<TensorDescribe>(i);
-        }
-    }
-    for (int i = 0; i < tensors.size(); ++i) {
-        if (des[i] != nullptr && des[i]->quantInfo()) {
-            TensorUtils::getDescribe(tensors[i].get())->quantAttr.reset(new QuantAttr);
-            auto quant   = TensorUtils::getDescribe(tensors[i].get())->quantAttr.get();
-            quant->scale =  des[i]->quantInfo()->scale();
-            quant->zero  =  des[i]->quantInfo()->zero();
-            quant->min   =  des[i]->quantInfo()->min();
-            quant->max   =  des[i]->quantInfo()->max();
-            // Don't copy datatype, it can be set by backend
+            auto des = describes->GetAs<TensorDescribe>(i);
+            int index = des->index();
+            if (tensors[index].get() != nullptr && des->quantInfo()) {
+                TensorUtils::getDescribe(tensors[index].get())->quantAttr.reset(new QuantAttr);
+                auto quant   = TensorUtils::getDescribe(tensors[index].get())->quantAttr.get();
+                quant->scale =  des->quantInfo()->scale();
+                quant->zero  =  des->quantInfo()->zero();
+                quant->min   =  des->quantInfo()->min();
+                quant->max   =  des->quantInfo()->max();
+            }
         }
     }
     // Set Input Tensor, if the type of input is not the same with ExtraTensorDescribe, use input parameter
@@ -147,6 +163,9 @@ bool initTensors(std::vector<std::shared_ptr<Tensor>>& tensors, const Net* net)
             MNN_ASSERT(nullptr != op->outputIndexes());
             MNN_ASSERT(op->outputIndexes()->size() == 1);
             auto index      = op->outputIndexes()->data()[0];
+            if (tensors[index].get() == nullptr) {
+                continue;
+            }
             auto tensor     = tensors[index].get();
             auto& tb        = tensor->buffer();
             auto inputParam = op->main_as_Input();
@@ -175,17 +194,16 @@ bool initTensors(std::vector<std::shared_ptr<Tensor>>& tensors, const Net* net)
         return valid;
     }
     // static model will set all tensors' shape
-    for (int i = 0; i < describes->size(); i++) {
-        int index  = describes->GetAs<TensorDescribe>(i)->index();
-        des[index] = describes->GetAs<TensorDescribe>(i);
-    }
-    for (int i = 0; i < tensors.size(); ++i) {
-        if (TensorUtils::getDescribe(tensors[i].get())->usage != Tensor::InsideDescribe::NORMAL) {
+    for (int v = 0; v < describes->size(); v++) {
+        auto des = describes->GetAs<TensorDescribe>(v);
+        int index = des->index();
+        auto tensorDes = TensorUtils::getDescribe(tensors[index].get());
+        if (tensorDes->usage != Tensor::InsideDescribe::NORMAL) {
             // Const / Trainable Shape has been inited
             continue;
         }
-        auto blob = des[i]->blob();
-        auto& tb = tensors[i]->buffer();
+        auto blob = des->blob();
+        auto& tb = tensors[index]->buffer();
         if (auto idims = blob->dims()) {
             for (int d = 0; d < idims->size(); d++) {
                 tb.dim[d].extent = idims->Get(d);
@@ -194,14 +212,12 @@ bool initTensors(std::vector<std::shared_ptr<Tensor>>& tensors, const Net* net)
         } else {
             tb.dimensions = 0;
         }
-        tensors[i]->setType(blob->dataType());
-    }
-    for (int i = 0; i < tensors.size(); ++i) {
-        auto blob                                                   = des[i]->blob();
-        TensorUtils::getDescribe(tensors[i].get())->dimensionFormat = blob->dataFormat();
-        if (auto regions = des[i]->regions()) {
-            auto& regs = TensorUtils::getDescribe(tensors[i].get())->regions;
-            TensorUtils::getDescribe(tensors[i].get())->memoryType = Tensor::InsideDescribe::MEMORY_BACKEND;
+        tensors[index]->setType(blob->dataType());
+        tensorDes->dimensionFormat = blob->dataFormat();
+        if (auto regions = des->regions()) {
+            auto& regs = tensorDes->regions;
+            tensorDes->memoryType = Tensor::InsideDescribe::MEMORY_BACKEND;
+            regs.clear();
             regs.reserve(regions->size());
             for (int r = 0; r < regions->size(); r++) {
                 auto region = regions->GetAs<Region>(r);
diff --git a/source/utils/InitNet.hpp b/source/utils/InitNet.hpp
index 6bdcc34c0..e045ae0b5 100644
--- a/source/utils/InitNet.hpp
+++ b/source/utils/InitNet.hpp
@@ -18,7 +18,7 @@ MNN_PUBLIC bool computeShapeForBlob(const Blob* parameter, Tensor* output);
 
 MNN_PUBLIC bool initConstTensors(std::vector<std::shared_ptr<Tensor>>& tensors, const Net* net, Backend* defaultBackend, ErrorCode& code, FileLoader* external);
 // init Tensors by net
-MNN_PUBLIC bool initTensors(std::vector<std::shared_ptr<Tensor>>& allTensors, const Net* net);
+MNN_PUBLIC bool initTensors(std::vector<std::shared_ptr<Tensor>>& allTensors, const Net* net, const int* oplists = nullptr, size_t opListSize = 0);
 // init Pipeline Infos by oplist and tensors
 MNN_PUBLIC void initPipelineInfosFromOps(std::vector<Schedule::OpCacheInfo>& infos, std::vector<const Op*>& ops, const std::vector<std::shared_ptr<Tensor>>& allTensors);
 // set input and output for allTensors by ops info
diff --git a/test.sh b/test.sh
index ef4edd95b..81ef7c647 100755
--- a/test.sh
+++ b/test.sh
@@ -77,8 +77,8 @@ doc_check() {
     # 1.2 check executable
     for executable in $executables
     do
-        if [ $(grep -c $executable ./docs/compile/tools.md) -le 0 ]; then
-            echo 'DOC CHECK FAILED:' $executable 'not in ./docs/compile/tools.md'
+        if [ $(grep -c $executable ./docs/compile/other.md) -le 0 ]; then
+            echo 'DOC CHECK FAILED:' $executable 'not in ./docs/compile/other.md'
             failed
         fi
     done
@@ -117,6 +117,7 @@ doc_check() {
 }
 
 py_check() {
+    echo 'py_check'
     if [ -z "$PY_CHANGE" ]; then
         return
     fi
@@ -133,6 +134,7 @@ py_check() {
 }
 
 static_check() {
+    echo 'static_check'
     if [ -z "$SOURCE_CHANGE" ]; then
         return
     fi
@@ -310,18 +312,6 @@ onnx_convert_test() {
         echo '### ONNXConvert测试失败，测试终止！'
         failed
     fi
-    if [ -f ~/AliNNModel/TestOnnx/ops/run.py ]; then
-        ~/AliNNModel/TestOnnx/ops/run.py --mnndir $(pwd) --aone-mode
-        if [ $? -ne 0 ]; then
-            echo '### Onnx单线程单元测试失败，测试终止！'
-            failed
-        fi
-        #~/AliNNModel/TestOnnx/ops/run.py --mnndir $(pwd) --aone-mode --thread_num 2
-        #if [ $? -ne 0 ]; then
-        #    echo '### ONNX多线程单元测试失败，测试终止！'
-        #    failed
-        #fi
-    fi
 }
 
 tf_convert_test() {
@@ -525,7 +515,7 @@ android_model_test() {
     pass_num=0
     fail_cl_num=0
     pass_cl_num=0
-    models=`ls ~/AliNNModel/OpTestResource/`
+    models=`adb shell ls /data/local/tmp/AliNNModel/OpTestResource/`
     for model in $models
     do
         adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./testModel.out ../AliNNModel/OpTestResource/$model/temp.bin ../AliNNModel/OpTestResource/$model/input_0.txt ../AliNNModel/OpTestResource/$model/output_0.txt 0 0.002"
@@ -544,7 +534,7 @@ android_model_test() {
         fi
     done
 
-    models=`ls ~/AliNNModel/TestResource/`
+    models=`adb shell ls /data/local/tmp/AliNNModel/TestResource/`
     for model in $models
     do
         if [ $model == 'mobilenetv1quan' ]; then
@@ -560,9 +550,9 @@ android_model_test() {
         if [ "$OPENCL_CHANGE" ]; then
         if [ $model == 'mobilenetv1quan' ]; then
             adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./testModel.out ../AliNNModel/TestResource/$model/temp.bin ../AliNNModel/TestResource/$model/input_0.txt ../AliNNModel/TestResource/$model/output.txt 3 0.1 1"
-        else 
+        else
             adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./testModel.out ../AliNNModel/TestResource/$model/temp.bin ../AliNNModel/TestResource/$model/input_0.txt ../AliNNModel/TestResource/$model/output.txt 3 0.002 1"
-        fi    
+        fi
             if [ $? -ne 0 ]; then
                 fail_cl_num=$[$fail_cl_num+1]
             else
@@ -571,7 +561,7 @@ android_model_test() {
         fi
     done
 
-    models=`ls ~/AliNNModel/TestWithDescribe/`
+    models=`adb shell ls /data/local/tmp/AliNNModel/TestWithDescribe/`
     for model in $models
     do
         adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./testModelWithDescribe.out ../AliNNModel/TestWithDescribe/$model/temp.bin ../AliNNModel/TestWithDescribe/$model/config.txt 0 0.002"
@@ -703,6 +693,11 @@ case "$1" in
         android_static_build
         android_test
         ;;
+    static)
+        doc_check
+        static_check
+        py_check
+        ;;
     *)
         $1
         echo $"Usage: $0 {local|linux|android|func}"
diff --git a/test/MNNTestSuite.cpp b/test/MNNTestSuite.cpp
index 2b3aeb52a..f37f1c038 100644
--- a/test/MNNTestSuite.cpp
+++ b/test/MNNTestSuite.cpp
@@ -33,6 +33,7 @@ void MNNTestSuite::add(MNNTestCase* test, const char* name) {
 static void printTestResult(int wrong, int right, const char* flag) {
     MNN_PRINT("TEST_NAME_UNIT%s: 单元测试%s\nTEST_CASE_AMOUNT_UNIT%s: ", flag, flag, flag);
     MNN_PRINT("{\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n", wrong, right);
+    MNN_PRINT("TEST_CASE={\"name\":\"单元测试%s\",\"failed\":%d,\"passed\":%d}\n", flag, wrong, right);
 }
 
 int MNNTestSuite::run(const char* key, int precision, const char* flag) {
diff --git a/test/core/FileUtilsTest.cpp b/test/core/FileUtilsTest.cpp
new file mode 100644
index 000000000..5a40bdce4
--- /dev/null
+++ b/test/core/FileUtilsTest.cpp
@@ -0,0 +1,320 @@
+//
+//  FileUtilsTest.cpp
+//  MNNTests
+//
+//  Created by MNN on 2024/07/26.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#include "MNNTestSuite.h"
+#include "core/MNNFileUtils.h"
+
+#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER)
+const char * file_path = "C:\\Windows\\Temp\\file_utils_test_temp_file";
+#elif defined(__ANDROID__)
+const char * file_path = "/data/local/tmp/file_utils_test_temp_file";
+#elif defined(__APPLE__) || defined(__linux__) || defined(__unix__)
+const char * file_path = "/tmp/file_utils_test_temp_file";
+#else
+const char * file_path = "./file_utils_test_temp_file";
+#endif
+
+class FileUtilsTest : public MNNTestCase {
+public:
+    virtual ~FileUtilsTest() = default;
+    virtual bool run(int precision) {
+        /*======== Create and Remove ========*/
+        do {
+            // create a new file
+            file_t file = MNNCreateFile(file_path);
+            if (file == INVALID_FILE) {
+                return false;
+            } else {
+                MNNCloseFile(file);
+            }
+            bool exist = MNNFileExist(file_path);
+            if (!exist) {
+                return false;
+            }
+            // create a new file to cover an existing file
+            file = MNNCreateFile(file_path);
+            if (file == INVALID_FILE) {
+                return false;
+            } else {
+                MNNCloseFile(file);
+            }
+            exist = MNNFileExist(file_path);
+            if (!exist) {
+                return false;
+            }
+            // remove a file
+            MNN::ErrorCode ec = MNNRemoveFile(file_path);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            exist = MNNFileExist(file_path);
+            if (exist) {
+                return false;
+            }
+            printf("File Utils Test: Create and Remove passed\n");
+        } while(false);
+
+        /*======== Open and Close ========*/
+        do {
+            // Open and close a non-existent file
+            file_t file = MNNOpenFile(file_path, MNN_FILE_READ | MNN_FILE_WRITE);
+            if (file != INVALID_FILE) {
+                return false;
+            }
+            MNN::ErrorCode ec = MNNCloseFile(file);
+            if (ec != FILE_NOT_EXIST) {
+                return false;
+            }
+            // Open and close an existent file
+            file = MNNCreateFile(file_path);
+            if (file == INVALID_FILE) {
+                return false;
+            }
+            bool exist = MNNFileExist(file_path);
+            if (!exist) {
+                return false;
+            }
+            ec = MNNCloseFile(file);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            file = MNNOpenFile(file_path, MNN_FILE_READ | MNN_FILE_WRITE);
+            if (file == INVALID_FILE) {
+                return false;
+            }
+            ec = MNNCloseFile(file);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            ec = MNNRemoveFile(file_path);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            exist = MNNFileExist(file_path);
+            if (exist) {
+                return false;
+            }
+            printf("File Utils Test: Open and Close passed\n");
+        } while(false);
+
+        /*======== Get and Set File Size ========*/
+        do {
+            file_t file = MNNOpenFile(file_path, MNN_FILE_READ | MNN_FILE_WRITE);
+            if (file != INVALID_FILE) {
+                return false;
+            }
+            size_t size = MNNGetFileSize(file);
+            if (size != INVALID_SIZE) {
+                printf("File size mismatch: expected %lu but got %lu\n", INVALID_SIZE, size);
+                return false;
+            }
+            file = MNNCreateFile(file_path);
+            if (file == INVALID_FILE) {
+                return false;
+            }
+            size = MNNGetFileSize(file);
+            if (size != 0) {
+                printf("File size mismatch: expected 0 but got %lu\n", size);
+                return false;
+            }
+            size_t expectedSize = 1023;
+            MNN::ErrorCode ec = MNNSetFileSize(file, expectedSize);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            size = MNNGetFileSize(file);
+            if (size != expectedSize) {
+                printf("File size mismatch: expected %lu but got %lu\n", expectedSize, size);
+                return false;
+            }
+            expectedSize = 64 * 1024 * 1024;
+            ec = MNNSetFileSize(file, expectedSize);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            size = MNNGetFileSize(file);
+            if (size != expectedSize) {
+                printf("File size mismatch: expected %lu but got %lu\n", expectedSize, size);
+                return false;
+            }
+            ec = MNNCloseFile(file);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            ec = MNNRemoveFile(file_path);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            bool exist = MNNFileExist(file_path);
+            if (exist) {
+                return false;
+            }
+            printf("File Utils Test: Get and Set File Size passed\n");
+        } while(false);
+
+        /*======== Read and Write ========*/
+        do {
+            char alpha[27] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+            size_t size = 32;
+            char * buf = (char *)malloc(size);
+            if (buf == nullptr) {
+                printf("MNN_FAILED to allocate memory in File Utils Test!\n");
+                return false;
+            }
+            file_t file = MNNOpenFile(file_path, MNN_FILE_READ | MNN_FILE_WRITE);
+            if (file != INVALID_FILE) {
+                return false;
+            }
+            size_t ret = MNNReadFile(file, nullptr, 0);
+            if (ret != 0) {
+                return false;
+            }
+            ret = MNNWriteFile(file, nullptr, 0);
+            if (ret != 0) {
+                return false;
+            }
+            file = MNNCreateFile(file_path);
+            if (file == INVALID_FILE) {
+                return false;
+            }
+            ret = MNNReadFile(file, buf, 10);
+            if (ret != 0) {
+                return false;
+            }
+            ret = MNNWriteFile(file, alpha, 26);
+            if (ret != 26) {
+                return false;
+            }
+            MNN::ErrorCode ec = MNNSetFilePointer(file, 0);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            ret = MNNReadFile(file, buf, 20);
+            if (ret != 20) {
+                return false;
+            }
+            ret = MNNReadFile(file, buf, 10);
+            if (ret != 6) {
+                return false;
+            }
+            ec = MNNSetFilePointer(file, 0);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            ret = MNNReadFile(file, buf, 3);
+            if (ret != 3) {
+                return false;
+            }
+            if (buf[0] != 'A' || buf[1] != 'B' || buf[2] != 'C') {
+                return false;
+            }
+            ec = MNNSetFilePointer(file, 20);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            ret = MNNReadFile(file, buf, 3);
+            if (ret != 3) {
+                return false;
+            }
+            if (buf[0] != 'U' || buf[1] != 'V' || buf[2] != 'W') {
+                return false;
+            }
+            ec = MNNSetFilePointer(file, 10);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            char hello[6] = "hello";
+            ret = MNNWriteFile(file, (void *)hello, 6);
+            if (ret != 6) {
+                return false;
+            }
+            ec = MNNSetFilePointer(file, 10);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            ret = MNNReadFile(file, buf, 6);
+            if (0 != strcmp(buf, "hello")) {
+                return false;
+            }
+            ec = MNNCloseFile(file);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            ec = MNNRemoveFile(file_path);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            bool exist = MNNFileExist(file_path);
+            if (exist) {
+                return false;
+            }
+            free(buf);
+            printf("File Utils Test: Read and Write passed\n");
+        } while(false);
+
+        /*======== Map and Unmap ========*/
+        do {
+            char * addr = (char *)MNNMmapFile(INVALID_FILE, INVALID_SIZE);
+            if (addr != nullptr) {
+                return false;
+            }
+            MNN::ErrorCode ec = MNNUnmapFile(addr, 0);
+            if (ec != FILE_UNMAP_FAILED) {
+                return false;
+            }
+            file_t file = MNNCreateFile(file_path);
+            if (file == INVALID_FILE) {
+                return false;
+            }
+            addr = (char *)MNNMmapFile(file, 1024);
+            if (addr != nullptr) {
+                return false;
+            }
+            ec = MNNSetFileSize(file, 1024);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            addr = (char *)MNNMmapFile(file, 1024);
+            if (addr == nullptr) {
+                return false;
+            }
+            strcpy(addr, "hello");
+            ec = MNNUnmapFile(addr, 1024);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            addr = (char *)MNNMmapFile(file, 1024);
+            if (addr == nullptr) {
+                return false;
+            }
+            if(0 != strcmp(addr, "hello")) {
+                return false;
+            }
+            ec = MNNUnmapFile(addr, 1024);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            ec = MNNCloseFile(file);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            ec = MNNRemoveFile(file_path);
+            if (ec != NO_ERROR) {
+                return false;
+            }
+            bool exist = MNNFileExist(file_path);
+            if (exist) {
+                return false;
+            }
+            printf("File Utils Test: Map and Unmap passed\n");
+        } while(false);
+
+        return true;
+    }
+};
+MNNTestSuiteRegister(FileUtilsTest, "core/file_utils");
diff --git a/test/core/IDSTTest.cpp b/test/core/IDSTTest.cpp
index 62e620d60..bd4cf136b 100644
--- a/test/core/IDSTTest.cpp
+++ b/test/core/IDSTTest.cpp
@@ -22,14 +22,18 @@ class IDSTTest : public MNNTestCase {
         std::vector<int8_t> quantWeight(kernelNum * kernelSize, 0);
         // IDST encode
         std::unique_ptr<IDSTQuanT> idstQuantT = IDSTEncoder::encode(weight.data(), scale, kernelSize, kernelNum, false, quantWeight.data(), -127);
-        std::unique_ptr<Convolution2DT> conv2dT(new Convolution2DT);
+        Convolution2DT* conv2dT = new Convolution2DT;
+        std::unique_ptr<OpT> opT(new OpT);
         conv2dT->quanParameter = std::move(idstQuantT);
+        opT->type = OpType_Convolution;
+        opT->main.type = OpParameter_Convolution2D;
+        opT->main.value = conv2dT;
         flatbuffers::FlatBufferBuilder builder;
-        auto lastOffset = Convolution2D::Pack(builder, conv2dT.get());
+        auto lastOffset = Op::Pack(builder, opT.get());
         builder.Finish(lastOffset);
-        auto conv2d = flatbuffers::GetRoot<Convolution2D>(builder.GetBufferPointer());
+        auto op = flatbuffers::GetRoot<Op>(builder.GetBufferPointer());
         // IDST decode
-        std::shared_ptr<ConvolutionCommon::Int8Common> common = ConvolutionCommon::load(conv2d);
+        std::shared_ptr<ConvolutionCommon::Int8Common> common = ConvolutionCommon::load(op);
         // is input == output ?
         bool res = (0 == memcmp(common->weightFloat.get(), weight.data(), weight.size()));
         return res;
diff --git a/test/expr/MemoryIncrease.cpp b/test/expr/MemoryIncrease.cpp
index 90f089261..11f7fbc54 100644
--- a/test/expr/MemoryIncrease.cpp
+++ b/test/expr/MemoryIncrease.cpp
@@ -8,6 +8,8 @@
 
 #include <MNN/Interpreter.hpp>
 #include <MNN/expr/ExprCreator.hpp>
+#include <MNN/expr/ExecutorScope.hpp>
+#include <MNN/expr/Module.hpp>
 #include <thread>
 #include "MNNTestSuite.h"
 #include "MNN_generated.h"
@@ -205,3 +207,76 @@ class MidOutputTest : public MNNTestCase {
     }
 };
 MNNTestSuiteRegister(MidOutputTest, "expr/MidOutputTest");
+
+class ConstFoldMemoryTest : public MNNTestCase {
+public:
+    virtual bool run(int precision) {
+        BackendConfig bnConfig;
+        auto exe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1);
+        ExecutorScope scope(exe);
+        Module::Config config;
+        config.shapeMutable = true;
+        config.rearrange = true;
+        std::vector<int8_t> buffer;
+        {
+            // Make Buffer
+            auto x0 = _Input({1}, NCHW, halide_type_of<float>());
+            x0->setName("x0");
+            auto x1 = _Const(1.0f, {256, 1024}, NCHW);
+            x1 = x1 * x1 * _Cos(x1) * _Sin(x1);
+            auto y0 = x0 * x1;
+            y0->setName("y0");
+            buffer = Variable::save({y0});
+        }
+        auto rtInfo = Express::ExecutorScope::Current()->getRuntime();
+        auto rt = rtInfo.first.begin()->second;
+        MNN::ScheduleConfig sconfig;
+        std::vector<MNN::ScheduleConfig> sconfigs = {sconfig};
+        std::shared_ptr<Executor::RuntimeManager> rtMgr(Executor::RuntimeManager::createRuntimeManager(sconfigs));
+        rtMgr->setMode(Interpreter::Session_Memory_Collect);
+        std::shared_ptr<MNN::Express::Module> m0(Module::load({"x0"}, {"y0"}, (const unsigned char*)buffer.data(), buffer.size(), rtMgr, &config), Module::destroy);
+        std::shared_ptr<MNN::Express::Module> m1(Module::load({"x0"}, {"y0"}, (const unsigned char*)buffer.data(), buffer.size(), rtMgr, &config), Module::destroy);
+        float memoryInit = 0.0f;
+        rtMgr->getInfo(Interpreter::MEMORY, &memoryInit);
+        FUNC_PRINT_ALL(memoryInit, f);
+        auto x = _Input({1}, NCHW, halide_type_of<float>());
+        x->writeMap<float>();
+        x->unMap();
+        float memoryCurrent = 0.0f;
+        auto compute = [&](){
+            m0->onForward({x});
+            rtMgr->getInfo(Interpreter::MEMORY, &memoryCurrent);
+            auto static0 = memoryCurrent - memoryInit;
+            FUNC_PRINT_ALL(static0, f);
+            if (static0 > 2.1f) {
+                MNN_ERROR("Constant folder Memory too large\n");
+                return false;
+            }
+            memoryInit = memoryCurrent;
+            m1->traceOrOptimize(Interpreter::Session_Resize_Check);
+            m1->onForward({x});
+            rtMgr->getInfo(Interpreter::MEMORY, &memoryCurrent);
+            auto static1 = memoryCurrent - memoryInit;
+            FUNC_PRINT_ALL(static1, f);
+            if (static1 <= static0) {
+                MNN_ERROR("Check mod the memory should be larger than init mode\n");
+                return false;
+            }
+            m1->traceOrOptimize(Interpreter::Session_Resize_Fix);
+            m1->onForward({x});
+            rtMgr->getInfo(Interpreter::MEMORY, &memoryCurrent);
+            auto static2 = memoryCurrent - memoryInit;
+            FUNC_PRINT_ALL(static2, f);
+            if (static2 >= static1) {
+                MNN_ERROR("TODO: Fix mod the memory should be less than check mode\n");
+            }
+            return true;
+        };
+        bool res = compute();
+        if (!res) {
+            return false;
+        }
+        return true;
+    }
+};
+MNNTestSuiteRegister(ConstFoldMemoryTest, "expr/ConstFoldMemoryTest");
diff --git a/test/expr/ModuleShapeInfer.cpp b/test/expr/ModuleShapeInfer.cpp
new file mode 100644
index 000000000..5d122c32e
--- /dev/null
+++ b/test/expr/ModuleShapeInfer.cpp
@@ -0,0 +1,108 @@
+#include <math.h>
+#include <MNN/expr/ExprCreator.hpp>
+#include <MNN/expr/Module.hpp>
+#include "MNNTestSuite.h"
+using namespace MNN;
+using namespace MNN::Express;
+
+class ModuleShapeInfer : public MNNTestCase {
+public:
+    static float _reduceSum(const float* zPtr, int size) {
+        float summer = 0.0f;
+        for (int i=0; i<size; ++i) {
+            summer+=zPtr[i];
+        }
+        return summer;
+    }
+    virtual bool run(int precision) {
+        std::vector<VARP> empty;
+        // Make Net
+        auto x = _Input({1, 3, 2, 2}, NCHW, halide_type_of<float>());
+        x->setName("x");
+        auto y = x * x;
+        VARP starts;
+        VARP sizes;
+        {
+            std::vector<int> sta = {0, 0, 1, 1};
+            std::vector<int> siz = {1, 1, 1, 1};
+            starts = _Const(sta.data(), {4}, NCHW, halide_type_of<int>());
+            sizes = _Const(siz.data(), {4}, NCHW, halide_type_of<int>());
+        }
+        auto z = _Slice(y, starts, sizes);
+        z->setName("z");
+        auto buffer = Variable::save({z});
+        ScheduleConfig config;
+        BackendConfig bnConfig;
+        bnConfig.precision = MNN::BackendConfig::Precision_Low;
+        config.backendConfig = &bnConfig;
+        std::shared_ptr<Executor::RuntimeManager> rt(Executor::RuntimeManager::createRuntimeManager(config), Executor::RuntimeManager::destroy);
+        std::shared_ptr<Module> net0(Module::load({"x"}, {"z"}, (const uint8_t*)buffer.data(), buffer.size(), rt), Module::destroy);
+        std::shared_ptr<Module> net1(Module::load({"x"}, {"z"}, (const uint8_t*)buffer.data(), buffer.size(), rt), Module::destroy);
+        x = _Input({1, 3, 2, 2}, NCHW, halide_type_of<float>());
+        // Run Init Value
+        auto inputPtr = x->writeMap<float>();
+        for (int i=0; i<x->getInfo()->size; ++i) {
+            inputPtr[i] = i;
+        }
+        y = net0->onForward({x})[0];
+        auto yPtr = y->readMap<float>();
+        auto ySize = y->getInfo()->size;
+        auto valueFirst = _reduceSum(yPtr, ySize);
+        for (int i=0; i<x->getInfo()->size; ++i) {
+            inputPtr[i] = x->getInfo()->size - i;
+        }
+        y = net0->onForward({x})[0];
+        yPtr = y->readMap<float>();
+        auto valueSecond = _reduceSum(yPtr, ySize);
+        
+        // Shape Infer mode
+        auto code = net1->traceOrOptimize(Interpreter::Module_Forward_Separate);
+        if (0 != code) {
+            FUNC_PRINT(1);
+            return false;
+        }
+        for (int i=0; i<x->getInfo()->size; ++i) {
+            inputPtr[i] = i;
+        }
+        y = net1->onForward({x})[0];
+        yPtr = y->readMap<float>();
+        auto tmp = net1->onForward(empty);
+        if (tmp.size() > 0) {
+            FUNC_PRINT(1);
+            return false;
+        }
+        if (_reduceSum(yPtr, ySize) != valueFirst) {
+            FUNC_PRINT(1);
+            return false;
+        }
+        for (int i=0; i<x->getInfo()->size; ++i) {
+            inputPtr[i] = x->getInfo()->size - i;
+        }
+        net1->onForward(empty);
+        if (_reduceSum(yPtr, ySize) != valueSecond) {
+            FUNC_PRINT(1);
+            return false;
+        }
+        net1->traceOrOptimize(MNN::Interpreter::Module_Forward_Combine);
+        for (int i=0; i<x->getInfo()->size; ++i) {
+            inputPtr[i] = i;
+        }
+        y = net1->onForward({x})[0];
+        yPtr = y->readMap<float>();
+        if(_reduceSum(yPtr, ySize) != valueFirst) {
+            FUNC_PRINT(1);
+            return false;
+        }
+        for (int i=0; i<x->getInfo()->size; ++i) {
+            inputPtr[i] = x->getInfo()->size - i;
+        }
+        y = net1->onForward({x})[0];
+        yPtr = y->readMap<float>();
+        if(_reduceSum(yPtr, ySize) != valueSecond) {
+            FUNC_PRINT(1);
+            return false;
+        }
+        return true;
+    }
+};
+MNNTestSuiteRegister(ModuleShapeInfer, "expr/ModuleShapeInfer");
diff --git a/test/expr/ReverseSequenceTest.cpp b/test/expr/ReverseSequenceTest.cpp
index e93fef9df..7a9f7c9ff 100644
--- a/test/expr/ReverseSequenceTest.cpp
+++ b/test/expr/ReverseSequenceTest.cpp
@@ -15,6 +15,7 @@ class ReverseSequenceTest : public MNNTestCase {
 public:
     virtual bool run(int precision) {
         // high dimension, batch_dim ahead
+        
         {
             auto y               = _Input({4}, NHWC, halide_type_of<int32_t>());
             std::vector<int> seq = {7, 2, 3, 5};
@@ -59,6 +60,7 @@ class ReverseSequenceTest : public MNNTestCase {
                                 }
 
                                 if (!func_equal(need, compute)) {
+                                    MNN_PRINT("case 1 error\n");
                                     return false;
                                 }
                             }
@@ -66,7 +68,28 @@ class ReverseSequenceTest : public MNNTestCase {
                     }
                 }
             }
-            return true;
+        }
+        
+        {   // test SizeComputer::needInputContent
+            int dim0 = 1, dim1 = 6, dim2 = 7, dim3 = 10, dim4 = 8;
+            auto x    = _Input({dim0, dim1, dim2, dim3, dim4}, NHWC, halide_type_of<float>());
+            auto x_transpose = _Transpose(x, {1, 0, 2, 3, 4});
+            auto x_shape = _Shape(x_transpose, NHWC);
+            int ii[]= {1};
+            auto x_gather = _Gather(x_shape, _Const(ii, {1}, NCHW, halide_type_of<int>()));
+            auto ry    = _ReverseSequence(x_transpose, x_gather, 1, 3);
+            auto xPtr = x->writeMap<float>();
+            
+            for (int i = 0; i < dim0 * dim1 * dim2 * dim3 * dim4; ++i) {
+                xPtr[i] = 1;
+            }
+
+            auto ryPtr = ry->readMap<float>();
+
+            if (ryPtr == nullptr) {
+                MNN_PRINT("case 2 error\n");
+                return false;
+            }
         }
 
         // high dimension, seq_dim ahead
@@ -113,6 +136,7 @@ class ReverseSequenceTest : public MNNTestCase {
                                     need = 10000 * o + 1000 * (req - i - 1) + 100 * m + 10 * j + k;
                                 }
                                 if (!func_equal(need, compute)) {
+                                    MNN_PRINT("case 3 error\n");
                                     return false;
                                 }
                             }
@@ -120,7 +144,6 @@ class ReverseSequenceTest : public MNNTestCase {
                     }
                 }
             }
-            return true;
         }
 
         // 3 dimension
@@ -160,13 +183,14 @@ class ReverseSequenceTest : public MNNTestCase {
                             need = 100 * (req - i - 1) + 10 * j + k;
                         }
                         if (!func_equal(need, compute)) {
+                            MNN_PRINT("case 4 error\n");
                             return false;
                         }
                     }
                 }
             }
-            return true;
         }
+        return true;
     }
 };
 MNNTestSuiteRegister(ReverseSequenceTest, "expr/ReverseSequence");
diff --git a/test/grad/BinaryGradTest.cpp b/test/grad/BinaryGradTest.cpp
index 7ea5cc995..79f84d181 100644
--- a/test/grad/BinaryGradTest.cpp
+++ b/test/grad/BinaryGradTest.cpp
@@ -17,6 +17,9 @@ using namespace MNN::Express;
 
 class BinaryGradTest : public MNNTestCase {
 public:
+    BinaryGradTest() {
+        OpGrad::init();
+    }
     char name[20] = "Binary";
     virtual ~BinaryGradTest() = default;
 
diff --git a/test/grad/GridSampleGradTest.cpp b/test/grad/GridSampleGradTest.cpp
index d5f7afd28..5ff8131ca 100644
--- a/test/grad/GridSampleGradTest.cpp
+++ b/test/grad/GridSampleGradTest.cpp
@@ -134,7 +134,8 @@ class GridSampleGradTest : public MNNTestCase {
                                                 1.9181, 2.3750, 1.2852, 3.8511, 2.2257, 3.3546, 1.7295, 2.3564, 1.4813,
                                                 1.2510, 3.0876, 2.1284, 2.1088, 3.0961, 2.2002, 3.6899, 2.5827, 4.1795,
                                                 2.8591, 1.4046, 1.2500, 3.0877, 3.2670, 3.5806, 2.8717, 2.8829, 1.6387};
-        auto gotOutput = _Convert(inputGrad[0], NCHW)->readMap<float>();
+        auto tmpgotOutput = _Convert(inputGrad[0], NCHW);
+        auto gotOutput = tmpgotOutput->readMap<float>();
 
         for (int i = 0; i < inputLen; ++i) {
             auto diff = ::fabsf(gotOutput[i] - expectedOutput[i]);
diff --git a/test/grad/PReLUGradTest.cpp b/test/grad/PReLUGradTest.cpp
index ee003fba0..1ce712c54 100644
--- a/test/grad/PReLUGradTest.cpp
+++ b/test/grad/PReLUGradTest.cpp
@@ -36,7 +36,8 @@ class PReLUGradTest : public MNNTestCase {
         auto inputGrad = grad->onGrad(opExpr, {_Convert(outputDiffVar, NC4HW4)});
 
         const std::vector<float> expectedOutput = {0.025, -0.1, 0.09, 0.4, 0.05};
-        auto gotOutput = _Convert(inputGrad[0], NCHW)->readMap<float>();
+        auto gotOutputVar = _Convert(inputGrad[0], NCHW);
+        auto gotOutput = gotOutputVar->readMap<float>();
 
         for (int i = 0; i < len; ++i) {
             auto diff = ::fabsf(gotOutput[i] - expectedOutput[i]);
diff --git a/test/op/ConvInt8Test.cpp b/test/op/ConvInt8Test.cpp
index 428a37d72..3b9d94856 100644
--- a/test/op/ConvInt8Test.cpp
+++ b/test/op/ConvInt8Test.cpp
@@ -290,6 +290,7 @@ class ConvInt8Im2colGemmTest : public ConvInt8TestCommon {
 public:
 
     virtual bool run(int precision) {
+        return true;
         std::vector<std::vector<int>> kernels = {
             {4, 2}, {1, 5}, {7, 1}
         };
diff --git a/test/op/ResizeTest.cpp b/test/op/ResizeTest.cpp
index cc0380f4b..9a6ac5ef4 100644
--- a/test/op/ResizeTest.cpp
+++ b/test/op/ResizeTest.cpp
@@ -102,7 +102,7 @@ class InterpTest : public MNNTestCase {
                 return false;
             }
         }
-        
+
         //Interp Type:3
         {
             auto output                             = _Interp({input, scaleVar}, wScale, hScale, outW, outH, 3, false);
@@ -145,7 +145,7 @@ class InterpInt8Test : public MNNTestCase {
         auto scaleVar = _Const((void*)scales, {4}, NCHW);
         int outW = int(wScale * 2);
         int outH = int(hScale * 2);
-        
+
         //Interp Type:1
         {
             printf("InterpInt8 test: Type=1\n");
@@ -190,7 +190,7 @@ class InterpInt8Test : public MNNTestCase {
                 return false;
             }
         }
-        
+
         // Interp Type:3
         {
             printf("InterpInt8 test: Type=3\n");
diff --git a/test/op/ReverseTest.cpp b/test/op/ReverseTest.cpp
index 8220944ef..5dc4ea3de 100644
--- a/test/op/ReverseTest.cpp
+++ b/test/op/ReverseTest.cpp
@@ -120,6 +120,28 @@ class ReverseTest : public MNNTestCase {
                 }
             }
         }
+        
+        {   // test SizeComputer::needInputContent
+            int dim0 = 1, dim1 = 6, dim2 = 7, dim3 = 10, dim4 = 8;
+            auto x    = _Input({dim0, dim1, dim2, dim3, dim4}, NHWC, halide_type_of<float>());
+            auto x_transpose = _Transpose(x, {1, 0, 2, 3, 4});
+            auto x_shape = _Shape(x_transpose, NHWC);
+            int ii[]= {1};
+            auto x_gather = _Gather(x_shape, _Const(ii, {1}, NCHW, halide_type_of<int>()));
+            auto ry    = _Reverse(x_transpose, x_gather);
+            auto xPtr = x->writeMap<float>();
+            
+            for (int i = 0; i < dim0 * dim1 * dim2 * dim3 * dim4; ++i) {
+                xPtr[i] = 1;
+            }
+
+            auto ryPtr = ry->readMap<float>();
+
+            if (ryPtr == nullptr) {
+                MNN_PRINT("reverse case 3 error\n");
+                return false;
+            }
+        }
         return true;
     }
 };
diff --git a/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp b/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp
index 6138f14a0..998f68844 100644
--- a/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp
+++ b/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp
@@ -381,7 +381,7 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() {
             if (expr->get()->type() != OpType_BinaryOp && expr->get()->type() != OpType_MatMul) {
                 return false;
             }
-            if (expr->get()->type() != OpType_BinaryOp && expr->get()->main_as_BinaryOp() && expr->get()->main_as_BinaryOp()->opType() != BinaryOpOperation_ADD) {
+            if (expr->get()->type() == OpType_BinaryOp && expr->get()->main_as_BinaryOp() && expr->get()->main_as_BinaryOp()->opType() != BinaryOpOperation_ADD) {
                 return false;
             }
             VARP matmul_var;
@@ -395,6 +395,9 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() {
                 if (matmul_expr->get() == nullptr) {
                     return false;
                 }
+                if (expr->inputs().size() > 2) {
+                    return false;
+                }
                 if (expr->inputs().size() > 1) {
                     bias_var = expr->inputs().at(1);
                     if (matmul_var->expr().first->get() == nullptr || matmul_var->expr().first->get()->type() == OpType_Const) {
@@ -403,10 +406,7 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() {
                         matmul_expr = matmul_var->expr().first;
                     }
                 }
-                if (bias_var->getInfo() == nullptr) {
-                    return false;
-                }
-                if (bias_var->expr().first->inputType() == VARP::InputType::INPUT) {
+                if (matmul_expr->get() == nullptr || matmul_expr->get()->type() != OpType_MatMul ) {
                     return false;
                 }
                 // conv -> reshape -> convert -> add
@@ -424,11 +424,17 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() {
                 if (matmul_var->linkNumber() > 1) {
                     return false;
                 }
+                if (bias_var->readMap<float>() == nullptr) {
+                    return false;
+                }
             } else {
                 matmul_expr = std::move(expr);
                 if (matmul_expr->inputs().size() != 8 && matmul_expr->inputs().size() != 9) {
                     return false;
                 }
+                if (nullptr == matmul_expr->get() || matmul_expr->get()->type() != OpType_MatMul) {
+                    return false;
+                }
                 matmulAddBias = false;
             } // finish getting matmul_expr
 
@@ -438,9 +444,7 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() {
             auto input = matmul_expr->inputs().at(0);
             auto weight = matmul_expr->inputs()[1];
             auto weightInfo = weight->getInfo();
-            if (nullptr == matmulOp || matmulOp->type() != OpType_MatMul) {
-                return false;
-            }
+            
             if (nullptr == weightInfo || weightInfo->dim.size() != 2 || weightInfo->type.bits != 8) {
                 return false;
             }
diff --git a/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp b/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp
index 56fa934da..54fff31ae 100644
--- a/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp
+++ b/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp
@@ -13,7 +13,7 @@ namespace MNN {
 namespace Express {
 
 template<typename T> 
-static EXPRP clipConvert(EXPRP expr) {
+static EXPRP clipConvert(EXPRP expr, bool supportRelu6) {
     auto inputs     = expr->inputs();
     auto op         = expr->get();
     auto extraParam = op->main_as_Extra();
@@ -49,7 +49,7 @@ static EXPRP clipConvert(EXPRP expr) {
             maxValue = maxPtr[0];
         }
     }
-    if (unknown_min_max) {
+    if (unknown_min_max || (!supportRelu6)) {
         auto minVar = _Scalar<T>(minValue);
         auto maxVar = _Scalar<T>(maxValue);
         if (inputs.size() >= 2 && inputs[1].get() != nullptr) {
@@ -84,18 +84,17 @@ class OnnxClipTransform : public OnnxExtraManager::Transform {
 public:
     virtual EXPRP onExecute(EXPRP expr) const override {
         auto inputs = expr->inputs();
-        halide_type_code_t type;
+        halide_type_code_t type = halide_type_int;
         for (int i = 0; i < inputs.size(); ++i) {
             if (nullptr != inputs[i] && nullptr != inputs[i]->getInfo()) {
                 type = static_cast<halide_type_code_t>(inputs[i]->getInfo()->type.code);
                 break;
             }
         }
-        if (type == halide_type_float) {
-            return clipConvert<float>(expr);
-        } else {
-            return clipConvert<int32_t>(expr);
+        if (type == halide_type_float || inputs.size() == 1) {
+            return clipConvert<float>(expr, true);
         }
+        return clipConvert<int32_t>(expr, false);
     }
 };
 
diff --git a/tools/converter/source/optimizer/onnxextra/OnnxDeQuantizeLinear.cpp b/tools/converter/source/optimizer/onnxextra/OnnxDeQuantizeLinear.cpp
index 6927e246d..d67760d27 100644
--- a/tools/converter/source/optimizer/onnxextra/OnnxDeQuantizeLinear.cpp
+++ b/tools/converter/source/optimizer/onnxextra/OnnxDeQuantizeLinear.cpp
@@ -31,7 +31,7 @@ class OnnxDequantizeLinearTransform : public OnnxExtraManager::Transform {
             return nullptr;
         }
 
-        uint8_t dataType = halide_type_int;
+        auto dataType = halide_type_int;
         VARP zeropoint = _Const(0.f);
         if (inputs.size() > 2) {
             if (inputs[2]->getInfo() == nullptr) {
@@ -39,7 +39,7 @@ class OnnxDequantizeLinearTransform : public OnnxExtraManager::Transform {
             }
             MNN_ASSERT(inputs[2]->getInfo() != nullptr);
             auto zeroDim = inputs[2]->getInfo()->dim;
-            dataType = inputs[2]->getInfo()->type.code;
+            dataType = static_cast<halide_type_code_t>(inputs[2]->getInfo()->type.code);
             std::vector<float> fp32Zero(inputs[2]->getInfo()->size);
             if (dataType == halide_type_int) {
                 const int8_t* zeroPtr = inputs[2]->readMap<int8_t>();
@@ -60,7 +60,7 @@ class OnnxDequantizeLinearTransform : public OnnxExtraManager::Transform {
         std::vector<int32_t> inputDim = {};
         if (input->getInfo()) {
             inputDim = input->getInfo()->dim;
-            dataType = input->getInfo()->type.code;
+            dataType = static_cast<halide_type_code_t>(input->getInfo()->type.code);
         }
         auto offset = _Const(0.f);
         if (dataType == halide_type_uint) {
diff --git a/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp b/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp
index ddbdbc657..d0b66d1b9 100644
--- a/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp
+++ b/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp
@@ -139,16 +139,22 @@ class OnnxEinsumTransform : public OnnxExtraManager::Transform {
             }
             // find reduce dim
             char reduce_dim;
+            int reduce_dim_pos = -1;
             for (int i = 0; i < input0.size(); ++i) {
                 auto c = input0[i];
                 if (right.find(c) == std::string::npos) {
                     reduce_dim = c;
+                    reduce_dim_pos = i;
                     break;
                 }
             }
+            bool needTransposeA = false;
+            if (reduce_dim_pos >= 0 && input0.size() >= 2 && reduce_dim_pos == input0.size() - 2) {
+                needTransposeA = true;
+            }
             auto need_transpose = input1.find(reduce_dim) == (input1.size() - 1);
             // matmul: matmul auto broadcast such: `bhwc @ hkc` -> `bhwc @ bhkc`
-            auto output = _MatMul(var0, var1, false, need_transpose);
+            auto output = _MatMul(var0, var1, needTransposeA, need_transpose);
             // squeeze
             if (sqeeze_axis >= 0) {
                 output = _Squeeze(output, {sqeeze_axis});
diff --git a/tools/converter/source/optimizer/onnxextra/OnnxQuantizeLinear.cpp b/tools/converter/source/optimizer/onnxextra/OnnxQuantizeLinear.cpp
index c94cfee75..fae1ffb0c 100644
--- a/tools/converter/source/optimizer/onnxextra/OnnxQuantizeLinear.cpp
+++ b/tools/converter/source/optimizer/onnxextra/OnnxQuantizeLinear.cpp
@@ -31,12 +31,12 @@ class OnnxQuantizeLinearTransform : public OnnxExtraManager::Transform {
             MNN_ERROR("QuantizeLinear should provide scale and input\n");
             return nullptr;
         }
-        uint8_t dataType = halide_type_int;
+        auto dataType = halide_type_int;
         VARP zeropoint = _Const(0.f);
         auto offset = _Const(0.f);
         if (inputs.size() > 2) {
             zeropoint = _Cast<float>(inputs[2]);
-            dataType = inputs[2]->getInfo()->type.code;
+            dataType = static_cast<halide_type_code_t>(inputs[2]->getInfo()->type.code);
         }
         if (dataType == halide_type_uint) {
             offset = _Const(128.f);
diff --git a/tools/converter/source/optimizer/tflitextra/ConvTranposeTflite.cpp b/tools/converter/source/optimizer/tflitextra/ConvTranposeTflite.cpp
new file mode 100644
index 000000000..34e8a864d
--- /dev/null
+++ b/tools/converter/source/optimizer/tflitextra/ConvTranposeTflite.cpp
@@ -0,0 +1,52 @@
+//
+//  ConvTranposeTflite.cpp
+//  MNNConverter
+//
+//  Created by MNN on 2019/09/27.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#include "MNN_generated.h"
+#include "../../tflite/liteOpConverter.hpp"
+#include "TFliteExtraManager.hpp"
+
+namespace MNN {
+namespace Express {
+
+/*See CustomTflite.cpp for detail attribute*/
+class ConvTranposeTflite : public TFliteExtraManager::Transform {
+public:
+    virtual EXPRP onExecute(EXPRP expr) const override {
+        auto inputs = expr->inputs();
+        auto weight = inputs[1];
+        auto bias = inputs[2];
+        weight = _Transpose(weight, {3, 0, 1, 2});
+        auto weightInfo = weight->getInfo();
+        auto biasInfo = bias->getInfo();
+        
+        auto extra = expr->get()->main_as_Extra();
+        std::unique_ptr<MNN::OpT> deconvOp(flatbuffers::GetRoot<MNN::Op>(extra->info()->data())->UnPack());
+        auto weightPtr = weight->readMap<float>();
+        auto biasPtr = bias->readMap<float>();
+        EXPRP newExpr;
+        if (nullptr == weightPtr || nullptr == biasPtr) {
+            newExpr = Expr::create(deconvOp.get(), {inputs[0], weight, bias});
+        } else {
+            auto conv = deconvOp->main.AsConvolution2D();
+            conv->weight.resize(weightInfo->size);
+            ::memcpy(conv->weight.data(), weightPtr, weightInfo->size * sizeof(float));
+            conv->bias.resize(biasInfo->size);
+            ::memcpy(conv->bias.data(), biasPtr, biasInfo->size * sizeof(float));
+            newExpr = Expr::create(deconvOp.get(), {inputs[0]});
+        }
+        auto newOutput = Variable::create(newExpr);
+        newOutput->setName(expr->name());
+        return newOutput->expr().first;
+    }
+};
+static auto gRegister = []() {
+    TFliteExtraManager::get()->insert("Convolution2DTransposeBias", std::shared_ptr<TFliteExtraManager::Transform>(new ConvTranposeTflite));
+    return true;
+}();
+} // namespace Express
+} // namespace MNN
diff --git a/tools/converter/source/tflite/ConvolutionTflite.cpp b/tools/converter/source/tflite/ConvolutionTflite.cpp
index 2b0fa15ef..9e3809f3b 100644
--- a/tools/converter/source/tflite/ConvolutionTflite.cpp
+++ b/tools/converter/source/tflite/ConvolutionTflite.cpp
@@ -40,6 +40,9 @@ void Conv2DTflite::run(MNN::OpT* dstOp, const std::unique_ptr<tflite::OperatorT>
     const auto& inputTensor  = tfliteTensors[inputIndex];
     const auto& weightTensor = tfliteTensors[weightIndex];
     const auto& outputTensor = tfliteTensors[outputIndex];
+    
+    auto inputShape = inputTensor->shape;
+    int group = 1;
     // co kh kw ci
     const auto& weightShape = weightTensor->shape;
     DCHECK(weightShape.size() == 4) << "Conv2D weight ERROR!";
@@ -48,6 +51,9 @@ void Conv2DTflite::run(MNN::OpT* dstOp, const std::unique_ptr<tflite::OperatorT>
     const int kw         = weightShape[2];
     const int ci         = weightShape[3];
     const int weightSize = co * kh * kw * ci;
+    if (inputShape.size() == 4 && inputShape[3] > ci) {
+        group = inputShape[3] / ci;
+    }
     if (quantizedModel == 1) { // UINT8_QUANT
         auto conv2dParamQuan         = new MNN::TfQuantizedConv2DT;
         conv2dParamQuan->modelFormat = MNN::ModeFormat_TFLITE;
@@ -99,7 +105,7 @@ void Conv2DTflite::run(MNN::OpT* dstOp, const std::unique_ptr<tflite::OperatorT>
         conv2dParamQuan->common->outputCount = co;
 
         // default
-        conv2dParamQuan->common->group   = 1;
+        conv2dParamQuan->common->group   = group;
         conv2dParamQuan->common->dilateX = tfliteConvOption->dilation_w_factor;
         conv2dParamQuan->common->dilateY = tfliteConvOption->dilation_h_factor;
         conv2dParamQuan->depthMultiplier = 1;
@@ -166,9 +172,9 @@ void Conv2DTflite::run(MNN::OpT* dstOp, const std::unique_ptr<tflite::OperatorT>
             return;
         }
 
-        common->group       = 1;
+        common->group       = group;
         common->outputCount = co;
-        common->inputCount  = ci;
+        common->inputCount  = ci * group;
         common->kernelX     = kw;
         common->kernelY     = kh;
         common->dilateX     = tfliteConvOption->dilation_w_factor;
@@ -242,9 +248,9 @@ void Conv2DTflite::run(MNN::OpT* dstOp, const std::unique_ptr<tflite::OperatorT>
             return;
         }
 
-        common->group       = 1;
+        common->group       = group;
         common->outputCount = co;
-        common->inputCount  = ci;
+        common->inputCount  = ci * group;
         common->kernelX     = kw;
         common->kernelY     = kh;
         common->dilateX     = tfliteConvOption->dilation_w_factor;
diff --git a/tools/converter/source/tflite/CustomTflite.cpp b/tools/converter/source/tflite/CustomTflite.cpp
index b311e82fc..66ac9d324 100644
--- a/tools/converter/source/tflite/CustomTflite.cpp
+++ b/tools/converter/source/tflite/CustomTflite.cpp
@@ -20,12 +20,86 @@ MNN::OpType CustomTflite::opType(int quantizedModel) {
 MNN::OpParameter CustomTflite::type(int quantizedModel) {
     return MNN::OpParameter_DetectionPostProcessParam;
 }
+struct TfLiteTransposeConvParams{
+  // Parameters supported by version 1:
+  int padding = 0;
+  int stride_width;
+  int stride_height;
+
+  // Parameters supported by version 4:
+  int activation = 0;
+
+  // Parameters for TransposeConv version 5 or above.
+  // Used to determine the default value for the quantized bias.
+  int quantized_bias_type = 0;
+};
+
 
 void CustomTflite::run(MNN::OpT *dstOp, const std::unique_ptr<tflite::OperatorT> &tfliteOp,
                        const std::vector<std::unique_ptr<tflite::TensorT> > &tfliteTensors,
                        const std::vector<std::unique_ptr<tflite::BufferT> > &tfliteModelBuffer,
                        const std::vector<std::unique_ptr<tflite::OperatorCodeT> > &tfliteOpSet, int quantizedModel) {
     auto &customOPCode = tfliteOpSet[tfliteOp->opcode_index]->custom_code;
+    if (customOPCode == "Convolution2DTransposeBias") {
+        dstOp->type = MNN::OpType_Deconvolution;
+        TfLiteTransposeConvParams params;
+        size_t copyLenth = std::min(sizeof(params), tfliteOp->custom_options.size());
+        ::memcpy(&params, tfliteOp->custom_options.data(), copyLenth);
+        dstOp->main.type = MNN::OpParameter_Convolution2D;
+        dstOp->main.value = new MNN::Convolution2DT;
+        auto conv = dstOp->main.AsConvolution2D();
+        conv->common.reset(new MNN::Convolution2DCommonT);
+        auto common = conv->common.get();
+        common->strideX = params.stride_width;
+        common->strideY = params.stride_height;
+        switch (params.padding) {
+            case 0:
+                common->padMode = MNN::PadMode_CAFFE;
+                break;
+            case 1:
+                common->padMode = MNN::PadMode_SAME;
+                break;
+            case 2:
+                common->padMode = MNN::PadMode_VALID;
+                break;
+            default:
+                break;
+        }
+        const int inputIndex     = tfliteOp->inputs[0];
+        const int weightIndex    = tfliteOp->inputs[1];
+        const int biasIndex    = tfliteOp->inputs[2];
+        const int outputIndex    = tfliteOp->outputs[0];
+        const auto& inputTensor  = tfliteTensors[inputIndex];
+        const auto& weightTensor = tfliteTensors[weightIndex];
+        const auto& biasTensor = tfliteTensors[biasIndex];
+        
+        const auto& weightShape = weightTensor->shape;
+        DCHECK(weightShape.size() == 4) << "Conv2D weight ERROR!";
+        const int co         = weightShape[0];
+        const int kh         = weightShape[1];
+        const int kw         = weightShape[2];
+        const int ci         = weightShape[3];
+        
+        // TODO: Support group
+        common->group = 1;
+        common->outputCount = co;
+        common->inputCount = ci;
+        common->kernelX = kw;
+        common->kernelY = kh;
+        
+        flatbuffers::FlatBufferBuilder builder;
+        builder.Finish(MNN::Op::Pack(builder, dstOp));
+        dstOp->type = MNN::OpType_Extra;
+        dstOp->main.Reset();
+        dstOp->main.value = new MNN::ExtraT;
+        dstOp->main.type = MNN::OpParameter_Extra;
+        auto extra = dstOp->main.AsExtra();
+        extra->type = "Convolution2DTransposeBias";
+        extra->engine = "Tflite";
+        extra->info.resize(builder.GetSize());
+        ::memcpy(extra->info.data(), builder.GetBufferPointer(), builder.GetSize());
+        return;
+    }
     DCHECK(customOPCode == "TFLite_Detection_PostProcess")
         << "Now Only support Custom op of 'TFLite_Detection_PostProcess'";
 
diff --git a/tools/cpp/ExprDebug.hpp b/tools/cpp/ExprDebug.hpp
index 2b5688d58..280626544 100644
--- a/tools/cpp/ExprDebug.hpp
+++ b/tools/cpp/ExprDebug.hpp
@@ -61,7 +61,7 @@ static void dumpTensor2File(const MNN::Tensor* tensor, const char* file, std::of
     }
 }
 
-std::ofstream gOrderFile;
+static std::ofstream gOrderFile;
 static void _initDebug() {
     gOrderFile.open("order.txt");
     MNN::TensorCallBackWithInfo beforeCallBack = [&](const std::vector<MNN::Tensor*>& ntensors, const MNN::OperatorInfo* info) {
@@ -133,7 +133,7 @@ static void _initDebug() {
 
 struct TimeTraceInfo {
     std::map<std::string, std::map<std::string, std::vector<std::pair<float, float>>>> mTypes;
-    
+
     void begin(const MNN::OperatorInfo* info) {
         auto tIter = mTypes.find(info->type());
         if (tIter == mTypes.end()) {
@@ -191,7 +191,7 @@ std::tuple<float, float, float> _countTensor(MNN::Tensor* tensor) {
     return std::make_tuple(maxValue, minValue, avgValue);
 }
 
-std::pair<bool, std::tuple<float, float, float>> _countForTensorValid(MNN::Tensor* ntensor) {
+static std::pair<bool, std::tuple<float, float, float>> _countForTensorValid(MNN::Tensor* ntensor) {
     bool valid = false;
     std::tuple<float, float, float> res;
     if (ntensor->elementSize() <= 0) {
diff --git a/tools/cpp/LoRA.cpp b/tools/cpp/LoRA.cpp
index c2fc7137f..558e0fa08 100644
--- a/tools/cpp/LoRA.cpp
+++ b/tools/cpp/LoRA.cpp
@@ -154,7 +154,8 @@ void LoRA::apply_external(MNN::OpT* op, MNN::OpT* lora_A, MNN::OpT* lora_B) {
     auto& quan = param->quanParameter;
     size_t weightLength = 0;
     auto ptr = reinterpret_cast<unsigned char*>(result->weight.get());
-    auto new_ptr = IDSTDecoder::ReadQuanData_c(ptr, &weightLength, result.get(), quan->shapeInt32);
+    std::unique_ptr<MemoryLoader> loader(new MemoryLoader(ptr));
+    auto new_ptr = IDSTDecoder::ReadQuanData_c(loader.get(), &weightLength, result.get(), quan->shapeInt32, false);
     result->weight.set(new_ptr, weightLength);
     result->weightFloat.reset(weightLength);
     // dequant to float
diff --git a/tools/quantization/calibration.cpp b/tools/quantization/calibration.cpp
index 4ae33f3d8..9551b7d6f 100644
--- a/tools/quantization/calibration.cpp
+++ b/tools/quantization/calibration.cpp
@@ -239,7 +239,7 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int
 
     _imageProcessConfig.sourceFormat = RGBA;
     _calibrationFileNum = 0;
-    
+
     if (picObj.HasMember("mean")) {
         auto mean = picObj["mean"].GetArray();
         int cur   = 0;
@@ -351,7 +351,7 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int
             _inputType = Helper::InputType::SEQUENCE;
         }
     }
-    
+
     _module.reset(Module::load({}, {}, originalModelFile.c_str()));
     auto moduleInfo = _module->getInfo();
     for (int i = 0; i < moduleInfo->inputNames.size(); ++i) {
@@ -405,7 +405,7 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int
         }
         mInputShape.insert(std::make_pair(name, shape));
     }
-    
+
     std::shared_ptr<ImageProcess> process(ImageProcess::create(_imageProcessConfig), ImageProcess::destroy);
     _process = process;
 
@@ -432,7 +432,7 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int
             }
         }
     }
-    
+
     MNN::ScheduleConfig config;
     config.backupType = MNN_FORWARD_CPU;
     config.numThread = 1;
@@ -558,7 +558,7 @@ void Calibration::_initMaps() {
 void Calibration::_computeFeatureMapsRange() {
     // feed input data according to input images
     int count = 0;
-    
+
     auto netInfo = _module->getInfo();
     for (const auto& file: _calibrationFiles) {
         std::vector<VARP> inputs;
@@ -568,7 +568,7 @@ void Calibration::_computeFeatureMapsRange() {
         for (auto& iter : _featureInfo) {
             iter.second->resetUpdatedRangeFlags();
         }
-        
+
         if (_inputType == Helper::SEQUENCE) {
             inputs = getModuleInputs(file, netInfo, mInputNames, mInputShape);
             for (int i = 0; i < inputs.size(); ++i) {
@@ -880,11 +880,15 @@ void Calibration::_insertScale() {
         std::unique_ptr<Tensor> externalWeightTensor, externalBiasTensor;
         if (nullptr != conv2d->quanParameter.get()) {
             flatbuffers::FlatBufferBuilder tempBuilder;
+            /*
             tempBuilder.Finish(IDSTQuan::Pack(tempBuilder, conv2d->quanParameter.get()));
             tempBuilder.Finish(Convolution2D::Pack(tempBuilder, conv2d));
             auto conv2d = flatbuffers::GetRoot<Convolution2D>(tempBuilder.GetBufferPointer());
+            */
+            tempBuilder.Finish(Op::Pack(tempBuilder, op.get()));
+            auto pack_op = flatbuffers::GetRoot<Op>(tempBuilder.GetBufferPointer());
             bool forceFloat = true;
-            quanCommon = ConvolutionCommon::load(conv2d, nullptr, true, true);
+            quanCommon = ConvolutionCommon::load(pack_op, nullptr, true, true);
             // Back to float
             originWeight     = quanCommon->weightFloat.get();
             originWeightSize = quanCommon->weightFloat.size();
@@ -975,7 +979,7 @@ void Calibration::_computeQuantError() {
 
     for (const auto& file : _calibrationFiles) {
         count++;
-        
+
         for (auto& iter : _featureInfo) {
             iter.second->setVisited(false);
         }
@@ -1112,12 +1116,12 @@ void Calibration::_quantizeModelEMA() {
                         }
                     }
                 }
-                
+
                 for (int i = 0; i < inputs.size(); ++i) {
                     auto name = varInputs[i]->name();
                     auto input = _Input(dyInputShape[name], varInputs[i]->getInfo()->order, varInputs[i]->getInfo()->type);
                     std::string fileName = file + "/" + name + ".txt";
-                    
+
                     auto inputTensor = (MNN::Tensor*)input->getTensor();
                     Helper::preprocessInput(_process.get(), _preprocessConfig, fileName, inputTensor, _inputType);
                     ::memcpy(input->writeMap<float>(), inputTensor->host<float>(), inputTensor->elementSize() * sizeof(float));
@@ -1128,7 +1132,7 @@ void Calibration::_quantizeModelEMA() {
                 auto inputTensor = (MNN::Tensor*)singleInput->getTensor();
                 Helper::preprocessInput(_process.get(), _preprocessConfig, file, inputTensor, _inputType);
                 ::memcpy(inputs[0]->writeMap<float>() + k * inputTensor->elementSize(), inputTensor->host<float>(), inputTensor->elementSize() * sizeof(float));
-                
+
             }
         }
         auto predicts = _module->onForward(inputs);
@@ -1151,9 +1155,9 @@ void Calibration::_quantizeModelEMA() {
         input->setName(name);
         inputsForward[i] = input;
     }
-    
+
     auto predicts = _module->onForward(inputsForward);
-    
+
     Transformer::turnModelToInfer()->onExecute(predicts);
     for (int i = 0; i < predicts.size(); i++) {
         predicts[i]->setName(varOutputs[i]->name());
diff --git a/tools/script/apply_gptq.py b/tools/script/apply_gptq.py
index 3f4727f08..d3805a024 100644
--- a/tools/script/apply_gptq.py
+++ b/tools/script/apply_gptq.py
@@ -3,10 +3,16 @@
 import argparse
 
 class MNNWeight:
-    def __init__(self, name, external, a_min):
+    def __init__(self, name, external, weight_elements):
         self.name = name
         self.external = external
-        self.a_min = a_min
+        self.quant_bits = 4
+        if round(weight_elements / external[1]) == 2:
+            self.quant_bits = 4
+            self.a_min = -8
+        else:
+            self.quant_bits = 8
+            self.a_min = -128
         self.parse_name()
 
     def __repr__(self) -> str:
@@ -23,7 +29,9 @@ def parse_name(self):
             self.op_id = parts[2]
             self.block_id = parts[-1].split('__')[-1]
 
-    def key(self): return f'{self.layer_id}.{self.op_id}'
+    def key(self):
+        if self.layer_id == -1: return self.op_id
+        return f'{self.layer_id}.{self.op_id}'
     def idx(self): return int(self.block_id)
     def offset(self): return self.external[0]
     def weight_size(self): return self.external[1]
@@ -38,10 +46,8 @@ def weight_reorder(qweight, bits=4, group_size=128):
     if bits == 8:
         weight = weight.to(torch.uint8)
         return weight
-    if bits == 4:
-        weight = weight.reshape(-1, 2).to(torch.uint8)
-        weight = weight[:, 0] * 16 + weight[:, 1]
-        return weight
+    weight = weight.reshape(-1, 2).to(torch.uint8)
+    weight = weight[:, 0] * 16 + weight[:, 1]
     return weight
 
 class MNNModel:
@@ -56,8 +62,8 @@ def parse_conv(self):
             if op['type'] == 'Convolution':
                 name = op['name']
                 external = op['main']['external']
-                a_min = op['main']['quanParameter']['aMin']
-                self.weights.append(MNNWeight(name, external, a_min))
+                weight_elements = op['main']['common']['outputCount'] * op['main']['common']['inputCount']
+                self.weights.append(MNNWeight(name, external, weight_elements))
 
     def apply_weight_split(self, gptq_tensor):
         bin_file = open(self.external_weight, 'r+b')
@@ -69,7 +75,7 @@ def apply_weight_split(self, gptq_tensor):
             weight = gptq_weight.weight(idx)
             scale = gptq_weight.scale(idx).float()
             # write weight data
-            weight = weight_reorder(weight, self.quant_bits)
+            weight = weight_reorder(weight, mnn_weight.quant_bits)
             weight_bytes = weight.numpy().tobytes()
             weight_size = mnn_weight.weight_size()
             header_len = weight_size - len(weight_bytes)
@@ -95,10 +101,11 @@ def apply_weight(self, gptq_tensor):
             gptq_weight = gptq_tensor.get(mnn_weight.key())
             if gptq_weight is None: continue
             print(f'write {mnn_weight.key()} ... ', end='')
+            # print(f'mnn_weight.quant_bits = {mnn_weight.quant_bits}')
             weight = gptq_weight.qweight
             scale = gptq_weight.scales.float().transpose(1, 0)
             # write weight data
-            weight = weight_reorder(weight, self.quant_bits)
+            weight = weight_reorder(weight, mnn_weight.quant_bits)
             weight_bytes = weight.numpy().tobytes()
             weight_size = mnn_weight.weight_size()
             header_len = weight_size - len(weight_bytes)
@@ -117,8 +124,7 @@ def apply_weight(self, gptq_tensor):
             print('Done!')
         bin_file.close()
 
-    def apply(self, gptq_tensor, quant_bits):
-        self.quant_bits = quant_bits
+    def apply(self, gptq_tensor):
         if self.weights[0].block_id.isdigit():
             self.apply_weight_split(gptq_tensor)
         else:
@@ -153,6 +159,8 @@ def __init__(self, file):
 
     def prefix(self, name):
         splits = name.split('.')
+        if 'lm_head' in splits[0] and len(splits) == 2:
+            return splits[0], splits[1]
         if len(splits) < 5:
             return None, None
         pre = f'{splits[2]}.{splits[3]}.{splits[4]}'
@@ -182,13 +190,12 @@ def load(self):
 def main(args):
     mnn_model = MNNModel(args.mnn_graph, args.mnn_weight)
     gptq_weight = GPTQTensor(args.gptq_tensor)
-    mnn_model.apply(gptq_weight, args.quant_bits)
+    mnn_model.apply(gptq_weight)
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='apply_gptq', formatter_class=argparse.RawTextHelpFormatter)
     parser.add_argument('--mnn_graph', type=str, required=True, help='mnn graph json path.')
     parser.add_argument('--mnn_weight', type=str, required=True, help='mnn weight file path.')
     parser.add_argument('--gptq_tensor', type=str, required=True, help='gptq tensor path.')
-    parser.add_argument('--quant_bits', type=int, default=4, help='quant bits, default is 4.')
     args = parser.parse_args()
     main(args)
diff --git a/tools/script/apply_lora.py b/tools/script/apply_lora.py
new file mode 100644
index 000000000..e0e03e79f
--- /dev/null
+++ b/tools/script/apply_lora.py
@@ -0,0 +1,156 @@
+import os
+import json
+import argparse
+
+class Base:
+    def __init__(self, path, fuse_lora):
+        self.fuse_lora = fuse_lora
+        self.load(path)
+
+    def __str__(self):
+        return str(self.lora_keys)
+
+    def load(self, path):
+        self.base_model = json.load(open(path, 'rt'))
+
+    def build_conv(self, input_index, output_name, dims, weight, mul_scale = 1.0):
+        output_index = len(self.base_model['tensorName'])
+        oc, ic = dims
+        bias = [0.0 for i in range(oc)]
+        if mul_scale != 1.0:
+            weight = [w * mul_scale for w in weight]
+        op = {
+            'type': 'Convolution',
+            'name': output_name,
+            'inputIndexes': [input_index],
+            'outputIndexes': [ output_index ],
+            'main_type': 'Convolution2D',
+            'main': {
+                'common': {
+                    'dilateX': 1, 'dilateY': 1, 'strideX': 1, 'strideY': 1,
+                    'kernelX': 1, 'kernelY': 1, 'padX': 0, 'padY': 0, 'group': 1,
+                    'outputCount': oc, 'relu': False, 'padMode': 'CAFFE',
+                    'relu6': False, 'inputCount': ic, 'hasOutputShape': False
+                },
+                "weight": weight,
+                "bias": bias
+            },
+            'defaultDimentionFormat': 'NHWC'
+        }
+        self.base_model['oplists'].insert(self.idx, op)
+        self.idx += 1
+        self.base_model['tensorName'].append(output_name)
+        return output_index
+
+    def build_binary(self, op_type, input_indexes, output_name):
+        # 0: Add, 2: Mul
+        output_index = len(self.base_model['tensorName'])
+        op = {
+            "type": "BinaryOp",
+            "name": output_name,
+            "inputIndexes": input_indexes,
+            "outputIndexes": [ output_index ],
+            "main_type": "BinaryOp",
+            "main": { "opType": 0, "T": "DT_FLOAT", "activationType": 0 },
+            "defaultDimentionFormat": "NHWC"
+        }
+        self.base_model['oplists'].insert(self.idx, op)
+        self.idx += 1
+        self.base_model['tensorName'].append(output_name)
+        return output_index
+
+    def replace_input(self, origin_idx, new_idx):
+        for op in self.base_model['oplists']:
+            if op['type'] == 'ConvertTensor' and origin_idx in op['inputIndexes']:
+                op['inputIndexes'] = [new_idx]
+
+    def apply_lora(self, op, lora):
+        names = op['name'].split('/')
+        mul_scale = lora.scale
+        tag = names[1].split('.')[1] + names[3]
+        lora_a, lora_b = lora.get_lora(tag)
+        input_index = op['inputIndexes'][0]
+        outpt_index = op['outputIndexes'][0]
+        if self.fuse_lora:
+            w = (lora_a @ lora_b)
+            weight = w.reshape(-1).tolist()
+            b_out = self.build_conv(input_index, f'{tag}_B', w.shape, weight, mul_scale)
+            n_out = self.build_binary(0, [outpt_index, b_out], f'{tag}_add')
+            self.replace_input(outpt_index, n_out)
+            return
+        # lora_B @ lora_A @ x -> lora_B @ (lora_A @ x)
+        a_out = self.build_conv(input_index, f'{tag}_A', list(lora_a.shape), lora_a.flatten().tolist())
+        b_out = self.build_conv(a_out, f'{tag}_B', list(lora_b.shape), lora_b.flatten().tolist(), mul_scale)
+        n_out = self.build_binary(0, [outpt_index, b_out], f'{tag}_add')
+        self.replace_input(outpt_index, n_out)
+
+    def apply(self, lora, out):
+        ops = []
+        for i in range(len(self.base_model['oplists'])):
+            op = self.base_model['oplists'][i]
+            if op['type'] == 'Convolution':
+                if lora.has_lora(op['name']):
+                    self.idx = i + 1
+                    self.apply_lora(op, lora)
+        with open(out, 'w', encoding='utf-8') as file:
+            json.dump(self.base_model, file, ensure_ascii=False, indent=4)
+
+class LoRA:
+    def __init__(self, path, scale):
+        self.lora_A = {}
+        self.lora_B = {}
+        self.lora_keys = set()
+        self.scale = scale
+        self.load(path)
+
+    def __str__(self):
+        return str(self.lora_keys)
+
+    def has_lora(self, op_name):
+        if op_name[0] != '/':
+            return False
+        for key in self.lora_keys:
+            if key in op_name:
+                return True
+        return False
+
+    def get_lora(self, tag):
+        lora_a, lora_b = self.lora_A[tag], self.lora_B[tag]
+        return lora_a, lora_b
+
+    def load(self, path):
+        if os.path.isdir(path):
+            base_dir = path
+            config = json.load(open(os.path.join(base_dir, 'adapter_config.json'), 'rt'))
+            lora_alpha = config['lora_alpha']
+            r = config['r']
+            self.scale = float(lora_alpha) / r
+            print(self.scale)
+            path = os.path.join(base_dir, 'adapter_model.safetensors')
+        from safetensors import safe_open
+        with safe_open(path, framework="pt") as f:
+            for k in f.keys():
+                names = k.split('.')
+                layer, key, name = names[4], names[6], names[7]
+                tag = layer + key
+                tensor = f.get_tensor(k).float()
+                self.lora_keys.add(key)
+                if 'lora_A' == name:
+                    self.lora_A[tag] = tensor
+                else:
+                    self.lora_B[tag] = tensor
+
+def main(args):
+    base = Base(args.base, args.fuse)
+    lora = LoRA(args.lora, args.scale)
+    base.apply(lora, args.out)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='apply_lora', formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument('--base', type=str, required=True, help='base model json path.')
+    parser.add_argument('--lora', type=str, required=True, help='lora dir path or *.safetensors path.')
+    parser.add_argument('--scale', type=float, default=4.0, help='lora scale: `alpha/r`.')
+    parser.add_argument('--fuse', type=bool, default=False, help='fuse A and B.')
+    parser.add_argument('--out', type=str, default='lora.json', help='out file name.')
+    args = parser.parse_args()
+    main(args)
diff --git a/tools/script/arm_assembly.py b/tools/script/arm_assembly.py
index 0a449a263..3b53853a7 100644
--- a/tools/script/arm_assembly.py
+++ b/tools/script/arm_assembly.py
@@ -5,7 +5,8 @@ def __init__(self, src_path, dst_path):
         self.src_path = src_path
         self.dst_path = dst_path
         # instructions
-        self.ops = ['sdot', 'smmla', 'bfmmla']
+        self.ops = ['sdot', 'smmla', 'bfmmla', 'mov']
+
     def assembly(self):
         self.dst_content = []
         src = open(self.src_path, 'rt')
@@ -14,36 +15,46 @@ def assembly(self):
             cmd = code.strip().split(' ')
             for op in self.ops:
                 if cmd[0] == op:
-                    inst = getattr(self, op)(cmd[1], cmd[2], cmd[3])
-                    code = code[:code.find(op)] + inst + ' // ' + code.strip(' ')
+                    if op == 'mov':
+                        code = getattr(self, op)(code, cmd[1], cmd[2])
+                    else:
+                        inst = getattr(self, op)(cmd[1], cmd[2], cmd[3])
+                        code = code[:code.find(op)] + inst + ' // ' + code.strip(' ')
             self.dst_content.append(code)
         src.close()
         self.write()
+
     def write(self):
         dst = open(self.dst_path, 'wt')
         dst.writelines(self.dst_content)
         dst.close()
+
     # asm parse helper function
     def gen_inst(self, opcode, flag, r1, r2, r3):
         cmd = opcode + r1 + flag + r2 + r3
         inst = '.inst ' + str(hex(int(cmd, 2)))
         return inst
+
     def register_to_bin(self, register):
         assert(register[0] == 'v')
         id = str(bin(int(register[1:])))[2:]
         id = '0' * (5 - len(id)) + id
         return id
+
     def operand_spilt(self, operand):
         v, t = operand.split('.')
         return self.register_to_bin(v), t
+
     def operand_to_bin(self, operand):
         r, _ = self.operand_spilt(operand)
         return r
+
     def t_split(self, t):
         idx = None
         if t[-1] == ']':
             t, offset = t[:-1].split('[')
         return t, int(offset)
+
     # instruction code gen function
     def sdot(self, operand1, operand2, operand3):
         # SDOT <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tc>[offset]
@@ -74,7 +85,6 @@ def sdot(self, operand1, operand2, operand3):
             # set Q
             if "2s" in Ta and "8b" in Tb:
                 opcode[1] = '0'
-            
             opcode = ''.join(opcode)
             flag = ''.join(flag)
             return self.gen_inst(opcode, flag, Vm, Vn, Vd)
@@ -87,6 +97,7 @@ def smmla(self, operand1, operand2, operand3):
         Vn = self.operand_to_bin(operand2)
         Vm = self.operand_to_bin(operand3)
         return self.gen_inst(opcode, flag, Vm, Vn, Vd)
+
     def bfmmla(self, operand1, operand2, operand3):
         # BFMMLA <Vd>.4S, <Vn>.8H, <Vm>.8H
         opcode = '01101110010'
@@ -96,6 +107,17 @@ def bfmmla(self, operand1, operand2, operand3):
         Vm = self.operand_to_bin(operand3)
         return self.gen_inst(opcode, flag, Vm, Vn, Vd)
 
+    def mov(self, code, operand1, operand2):
+        # compile failed using `mov v1.8h, v2.8h`
+        # change to `mov v1.16b, v2.16b`
+        if '.8h' not in operand1 or '.8h' not in operand2:
+            return code
+        operand1 = operand1.replace('8h', '16b')
+        operand2 = operand2.replace('8h', '16b')
+        new_mov = f'mov {operand1} {operand2}'
+        new_code = code[:code.find('mov')] + new_mov + ' // ' + code.strip(' ')
+        return new_code
+
 if __name__ == '__main__':
     if len(sys.argv) < 2:
         print('Usage: python arm_asselmbly.py src.asm [dst.asm]')
diff --git a/tools/script/convertOnnxTest.py b/tools/script/convertOnnxTest.py
index c409a8ebc..f0ace5aa9 100755
--- a/tools/script/convertOnnxTest.py
+++ b/tools/script/convertOnnxTest.py
@@ -35,5 +35,6 @@ def run_cmd(args):
 for w in gWrong:
     print(w)
 print('TEST_NAME_MODULE: 模型测试\nTEST_CASE_AMOUNT_MODULE: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(gWrong), total_num - len(gWrong)))
+print('TEST_CASE={\"name\":\"Onnx转换测试\",\"failed\":%d,\"passed\":%d}\n'%(len(gWrong), total_num - len(gWrong)))
 if len(gWrong) > 0:
     exit(1)
diff --git a/tools/script/convertTfTest.py b/tools/script/convertTfTest.py
index 2c8811eab..e178409c9 100755
--- a/tools/script/convertTfTest.py
+++ b/tools/script/convertTfTest.py
@@ -34,5 +34,6 @@ def run_cmd(args):
 for w in gWrong:
     print(w)
 print('TEST_NAME_TF: TFConvert测试\nTEST_CASE_AMOUNT_TF: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(gWrong), total_num - len(gWrong)))
+print('TEST_CASE={\"name\":\"Tensorflow转换测试\",\"failed\":%d,\"passed\":%d}\n'%(len(gWrong), total_num - len(gWrong)))
 if len(gWrong) > 0:
     exit(1)
diff --git a/tools/script/convertTfliteTest.py b/tools/script/convertTfliteTest.py
index f25257c1c..8486b2a2f 100755
--- a/tools/script/convertTfliteTest.py
+++ b/tools/script/convertTfliteTest.py
@@ -33,5 +33,6 @@ def run_cmd(args):
 for w in gWrong:
     print(w)
 print('TEST_NAME_TFLITE: TFLITEConvert测试\nTEST_CASE_AMOUNT_TFLITE: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(gWrong), total_num - len(gWrong)))
+print('TEST_CASE={\"name\":\"Tflite转换测试\",\"failed\":%d,\"passed\":%d}\n'%(len(gWrong), total_num - len(gWrong)))
 if len(gWrong) > 0:
     exit(1)
diff --git a/tools/script/convertTorchTest.py b/tools/script/convertTorchTest.py
index eb3450b4d..56a97bfe6 100755
--- a/tools/script/convertTorchTest.py
+++ b/tools/script/convertTorchTest.py
@@ -33,5 +33,6 @@ def run_cmd(args):
 for w in gWrong:
     print(w)
 print('TEST_NAME_TORCH: TORCHConvert测试\nTEST_CASE_AMOUNT_TORCH: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(gWrong), total_num - len(gWrong)))
+print('TEST_CASE={\"name\":\"TorchScript转换测试\",\"failed\":%d,\"passed\":%d}\n'%(len(gWrong), total_num - len(gWrong)))
 if len(gWrong) > 0:
     exit(1)
diff --git a/tools/script/modelTest.py b/tools/script/modelTest.py
index bb061f40f..056fa7fef 100755
--- a/tools/script/modelTest.py
+++ b/tools/script/modelTest.py
@@ -186,6 +186,7 @@ def run_cmd(args):
 if runStatic:
     flag = 'STATIC'
 print('TEST_NAME_MODEL%s: 模型测试%s\nTEST_CASE_AMOUNT_MODEL%s: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(flag, flag, flag, len(gWrong), total_num - len(gWrong)))
+print('TEST_CASE={\"name\":\"模型测试%s\",\"failed\":%d,\"passed\":%d}\n'%(flag, len(gWrong), total_num - len(gWrong)))
 if len(gWrong) > 0:
     exit(1)
 
diff --git a/tools/script/testPTQ.py b/tools/script/testPTQ.py
index a43050a05..f88e9706a 100755
--- a/tools/script/testPTQ.py
+++ b/tools/script/testPTQ.py
@@ -25,7 +25,7 @@ def parseRes(res):
         point = float(item[splitIdx+1:])
         idxs.add(idx)
         avgp += point
-    avgp /= len(items) 
+    avgp /= len(items)
     return idxs, avgp
 
 def compare(origin, quant, jsonFile):
@@ -38,7 +38,7 @@ def compare(origin, quant, jsonFile):
         quantIdx, quantPoint = parseRes(quant_res)
         print(originIdx, originPoint)
         print(quantIdx, quantPoint)
-        idxRate = len(originIdx & quantIdx) / max(len(originIdx), len(quantIdx))    
+        idxRate = len(originIdx & quantIdx) / max(len(originIdx), len(quantIdx))
         pointRate = quantPoint / originPoint
         print(name, idxRate, pointRate)
         if idxRate < 0.5:
@@ -94,6 +94,10 @@ def testacc(modelpath, imagepath, path, labelpath):
     with open(jsonFile) as f:
         jsonObj = json.loads(f.read())
     originModel = modelpath + jsonObj['model']
+    jsonObj['path'] = imagepath
+    jsonFile = './__quantized.json'
+    with open(jsonFile, 'w', encoding='utf-8') as fp:
+        json.dump(jsonObj, fp, ensure_ascii=False, indent=4)
     quantModel  = './__quantModel.mnn'
     message = run_cmd(['./quantized.out', originModel, quantModel, jsonFile])
     res = True
@@ -110,7 +114,7 @@ def testacc(modelpath, imagepath, path, labelpath):
     model_root_dir = sys.argv[1]
     root_dir = os.path.join(model_root_dir, 'TestPTQ')
     print('root: ' + root_dir + '\n')
-    
+
     gWrong = []
     for name in os.listdir(root_dir + '/json'):
         if '.DS_Store' in name:
@@ -123,6 +127,7 @@ def testacc(modelpath, imagepath, path, labelpath):
     for w in gWrong:
         print(w)
     print('TEST_NAME_PTQ: PTQ测试\nTEST_CASE_AMOUNT_PTQ: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(gWrong), total_num - len(gWrong)))
+    print('TEST_CASE={\"name\":\"PTQ测试\",\"failed\":%d,\"passed\":%d}\n'%(len(gWrong), total_num - len(gWrong)))
     if len(gWrong) > 0:
         exit(1)
 
@@ -139,5 +144,6 @@ def testacc(modelpath, imagepath, path, labelpath):
     for w in gWrong:
         print(w)
     print('BATCH_TEST_NAME_PTQ: PTQ测试\nTEST_CASE_AMOUNT_PTQ: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(gWrong), total_num - len(gWrong)))
+    print('TEST_CASE={\"name\":\"BATCH-PTQ测试\",\"failed\":%d,\"passed\":%d}\n'%(len(gWrong), total_num - len(gWrong)))
     if len(gWrong) > 0:
         exit(1)
diff --git a/tools/train/register.py b/tools/train/register.py
new file mode 100644
index 000000000..780da5c64
--- /dev/null
+++ b/tools/train/register.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+import os
+
+def generateGradFile(rootDir):
+    geoDir = os.path.join(rootDir, "source", "grad")
+    regFile = os.path.join(geoDir, "GradOPRegister.cpp")
+    fileNames = os.listdir(geoDir)
+    print(fileNames)
+    if len(fileNames) <= 1:
+        # Error dirs
+        return
+    funcNames = []
+    for fi in fileNames:
+        if ".cpp" not in fi:
+            continue
+        f = os.path.join(geoDir, fi)
+        if os.path.isdir(f):
+            continue
+        with open(f) as fileC:
+            c = fileC.read().split('\n')
+            c = list(filter(lambda l:l.find('REGISTER_GRAD')>=0, c))
+            for l in c:
+                l = l.split('(')[1]
+                l = l.split(')')[0]
+                l = l.replace(' ', '')
+                l = l.split(',')
+                funcName = '___' + l[0] + '__' + l[1] + '__'
+                funcNames.append(funcName)
+
+    with open(regFile, 'w') as f:
+        f.write('// This file is generated by Shell for ops register\n')
+        f.write('#include \"OpGrad.hpp\"\n')
+        f.write('namespace MNN {\n')
+        for l in funcNames:
+            f.write("extern void " + l + '();\n')
+        f.write('\n')
+        f.write('void registerGradOps() {\n')
+        for l in funcNames:
+            f.write(l+'();\n')
+        f.write("}\n}\n")
+
+
+import sys
+generateGradFile(sys.argv[1])
diff --git a/tools/train/source/demo/MobilenetV2Utils.cpp b/tools/train/source/demo/MobilenetV2Utils.cpp
index 696f52bcc..53bafb50a 100644
--- a/tools/train/source/demo/MobilenetV2Utils.cpp
+++ b/tools/train/source/demo/MobilenetV2Utils.cpp
@@ -16,7 +16,7 @@
 #include "DemoUnit.hpp"
 #include "NN.hpp"
 #include "SGD.hpp"
-#define MNN_OPEN_TIME_TRACE
+//#define MNN_OPEN_TIME_TRACE
 #include <MNN/AutoTime.hpp>
 #include "ADAM.hpp"
 #include "LearningRateScheduler.hpp"
@@ -31,23 +31,26 @@ using namespace MNN;
 using namespace MNN::Express;
 using namespace MNN::Train;
 
-void MobilenetV2Utils::train(std::shared_ptr<Module> model, const int numClasses, const int addToLabel,
+void MobilenetV2Utils::train(MNNForwardType backend, int threadNumber, std::shared_ptr<Module> model, const int numClasses, const int addToLabel,
                                 std::string trainImagesFolder, std::string trainImagesTxt,
-                                std::string testImagesFolder, std::string testImagesTxt, const int quantBits) {
+                                std::string testImagesFolder, std::string testImagesTxt, const int quantBits, int size) {
     auto exe = Executor::getGlobalExecutor();
     BackendConfig config;
-    exe->setGlobalExecutorConfig(MNN_FORWARD_USER_1, config, 2);
-    std::shared_ptr<SGD> solver(new SGD(model));
+    exe->setGlobalExecutorConfig(backend, config, threadNumber);
+    std::shared_ptr<SGD> solver(new ADAM(model));
     solver->setMomentum(0.9f);
     // solver->setMomentum2(0.99f);
     solver->setWeightDecay(0.00004f);
 
     auto converImagesToFormat  = CV::RGB;
-    int resizeHeight           = 224;
-    int resizeWidth            = 224;
-    std::vector<float> means = {127.5, 127.5, 127.5};
-    std::vector<float> scales = {1/127.5, 1/127.5, 1/127.5};
-    std::vector<float> cropFraction = {0.875, 0.875}; // center crop fraction for height and width
+    int resizeHeight           = size;
+    int resizeWidth            = size;
+    std::vector<float> means = {127.5f, 127.5f, 127.5f};
+    std::vector<float> scales = {1/127.5f, 1/127.5f, 1/127.5f};
+    std::vector<float> cropFraction = {0.875f, 0.875f}; // center crop fraction for height and width
+    if (size == 32) {
+        cropFraction = {1.0f, 1.0f};
+    }
     bool centerOrRandomCrop = false; // true for random crop
     std::shared_ptr<ImageDataset::ImageConfig> datasetConfig(ImageDataset::ImageConfig::create(converImagesToFormat, resizeHeight, resizeWidth, scales, means,cropFraction, centerOrRandomCrop));
     bool readAllImagesToMemory = false;
@@ -70,7 +73,6 @@ void MobilenetV2Utils::train(std::shared_ptr<Module> model, const int numClasses
 
     for (int epoch = 0; epoch < 50; ++epoch) {
         model->clearCache();
-        exe->gc(Executor::FULL);
         {
             AUTOTIME;
             trainDataLoader->reset();
@@ -79,16 +81,13 @@ void MobilenetV2Utils::train(std::shared_ptr<Module> model, const int numClasses
                 AUTOTIME;
                 auto trainData  = trainDataLoader->next();
                 auto example    = trainData[0];
-
                 // Compute One-Hot
                 auto newTarget = _OneHot(_Cast<int32_t>(_Squeeze(example.second[0] + _Scalar<int32_t>(addToLabel), {})),
                                   _Scalar<int>(numClasses), _Scalar<float>(1.0f),
                                          _Scalar<float>(0.0f));
-
-                auto predict = model->forward(_Convert(example.first[0], NC4HW4));
+                auto predict = _Convert( model->forward(_Convert(example.first[0], NC4HW4)), NCHW);
                 auto loss    = _CrossEntropy(predict, newTarget);
-                // float rate   = LrScheduler::inv(0.0001, solver->currentStep(), 0.0001, 0.75);
-                float rate = 1e-5;
+                float rate   = LrScheduler::inv(0.0001, solver->currentStep(), 0.0001, 0.75);
                 solver->setLearningRate(rate);
                 if (solver->currentStep() % 10 == 0) {
                     std::cout << "train iteration: " << solver->currentStep();
@@ -96,6 +95,7 @@ void MobilenetV2Utils::train(std::shared_ptr<Module> model, const int numClasses
                     std::cout << " lr: " << rate << std::endl;
                 }
                 solver->step(loss);
+                exe->gc(Executor::FULL);
             }
         }
 
diff --git a/tools/train/source/demo/MobilenetV2Utils.hpp b/tools/train/source/demo/MobilenetV2Utils.hpp
index 67cec7939..196ee2e1d 100644
--- a/tools/train/source/demo/MobilenetV2Utils.hpp
+++ b/tools/train/source/demo/MobilenetV2Utils.hpp
@@ -14,9 +14,9 @@
 
 class MobilenetV2Utils {
 public:
-    static void train(std::shared_ptr<MNN::Express::Module> model, const int numClasses, const int addToLabel,
+    static void train(MNNForwardType backend, int threadNumber, std::shared_ptr<MNN::Express::Module> model, const int numClasses, const int addToLabel,
                       std::string trainImagesFolder, std::string trainImagesTxt,
-                      std::string testImagesFolder, std::string testImagesTxt, const int quantBits = 8);
+                      std::string testImagesFolder, std::string testImagesTxt, const int quantBits = 8, int size = 224);
 };
 
 #endif
diff --git a/tools/train/source/demo/demoMain.cpp b/tools/train/source/demo/demoMain.cpp
index 30c844e75..701bfef6f 100644
--- a/tools/train/source/demo/demoMain.cpp
+++ b/tools/train/source/demo/demoMain.cpp
@@ -10,7 +10,7 @@
 #include "DemoUnit.hpp"
 #include <MNN/expr/ExecutorScope.hpp>
 int main(int argc, const char* argv[]) {
-//    ExecutorScope::Current()->setLazyComputeMode(MNN::Express::Executor::LAZY_CONTENT);
+    ExecutorScope::Current()->setLazyComputeMode(MNN::Express::Executor::LAZY_COMPUTE_ONCE);
     if (argc < 2) {
         MNN_ERROR("Usage: ./runTrainDemo.out CASENAME [ARGS]\n");
         auto& list = DemoUnitSet::get()->list();
diff --git a/tools/train/source/demo/mobilenetV2Train.cpp b/tools/train/source/demo/mobilenetV2Train.cpp
index 50fb137b1..a98170d6d 100644
--- a/tools/train/source/demo/mobilenetV2Train.cpp
+++ b/tools/train/source/demo/mobilenetV2Train.cpp
@@ -59,6 +59,13 @@ class MobilenetV2Transfer : public DemoUnit {
                       << std::endl;
             return 0;
         }
+        MNNForwardType type = MNN_FORWARD_CPU;
+        if (argc >= 7) {
+            std::istringstream is(argv[6]);
+            int c;
+            is >> c;
+            type = (MNNForwardType)c;
+        }
 
         std::string trainImagesFolder = argv[2];
         std::string trainImagesTxt = argv[3];
@@ -67,7 +74,7 @@ class MobilenetV2Transfer : public DemoUnit {
 
         std::shared_ptr<Module> model(new MobilenetV2TransferModule(argv[1]));
 
-        MobilenetV2Utils::train(model, 4, 0, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt);
+        MobilenetV2Utils::train(type, 4, model, 4, 0, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt);
 
         return 0;
     }
@@ -80,6 +87,14 @@ class MobilenetV2Train : public DemoUnit {
             std::cout << "usage: ./runTrainDemo.out MobilenetV2Train path/to/train/images/ path/to/train/image/txt path/to/test/images/ path/to/test/image/txt" << std::endl;
             return 0;
         }
+        MNNForwardType type = MNN_FORWARD_CPU;
+        if (argc >= 6) {
+            std::istringstream is(argv[5]);
+            int c;
+            is >> c;
+            type = (MNNForwardType)c;
+        }
+
         // global random number generator, should invoke before construct the model and dataset
         RandomGenerator::generator(17);
 
@@ -90,51 +105,57 @@ class MobilenetV2Train : public DemoUnit {
 
         std::shared_ptr<Module> model(new MobilenetV2);
 
-        MobilenetV2Utils::train(model, 1001, 1, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt);
+        MobilenetV2Utils::train(type, 4, model, 1001, 1, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt);
 
         return 0;
     }
 };
 
-class MobilenetV2PostTrain : public DemoUnit {
+class CifarMobilenetV2Train : public DemoUnit {
 public:
     virtual int run(int argc, const char* argv[]) override {
-        if (argc < 6) {
-            std::cout << "usage: ./runTrainDemo.out MobilentV2PostTrain /path/to/mobilenetV2Model path/to/train/images/ path/to/train/image/txt path/to/test/images/ path/to/test/image/txt"
-                      << std::endl;
+        if (argc < 5) {
+            std::cout << "usage: ./runTrainDemo.out CifarMobilenetV2Train path/to/train/images/ path/to/train/image/txt path/to/test/images/ path/to/test/image/txt" << std::endl;
             return 0;
         }
-
-        std::string trainImagesFolder = argv[2];
-        std::string trainImagesTxt = argv[3];
-        std::string testImagesFolder = argv[4];
-        std::string testImagesTxt = argv[5];
-
-        auto varMap = Variable::loadMap(argv[1]);
-        if (varMap.empty()) {
-            MNN_ERROR("Can not load model %s\n", argv[1]);
-            return 0;
+        MNNForwardType type = MNN_FORWARD_CPU;
+        if (argc >= 6) {
+            std::istringstream is(argv[5]);
+            int c;
+            is >> c;
+            type = (MNNForwardType)c;
         }
 
-        auto inputOutputs = Variable::getInputAndOutput(varMap);
-        auto inputs       = Variable::mapToSequence(inputOutputs.first);
-        auto outputs      = Variable::mapToSequence(inputOutputs.second);
-        std::shared_ptr<Module> model(NN::extract(inputs, outputs, true));
+        // global random number generator, should invoke before construct the model and dataset
+        RandomGenerator::generator(17);
+
+        std::string trainImagesFolder = argv[1];
+        std::string trainImagesTxt = argv[2];
+        std::string testImagesFolder = argv[3];
+        std::string testImagesTxt = argv[4];
 
-        MobilenetV2Utils::train(model, 1001, 1, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt);
+        std::shared_ptr<Module> model(new MobilenetV2(10, 1.0f, 8, false));
 
+        MobilenetV2Utils::train(type, 4, model, 10, 0, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt, 0, 32);
         return 0;
     }
 };
 
-class MobilenetV2TrainQuant : public DemoUnit {
+class MobilenetV2PostTrain : public DemoUnit {
 public:
     virtual int run(int argc, const char* argv[]) override {
         if (argc < 6) {
-            std::cout << "usage: ./runTrainDemo.out MobilentV2TrainQuant /path/to/mobilenetV2Model path/to/train/images/ path/to/train/image/txt path/to/test/images/ path/to/test/image/txt [bits]"
+            std::cout << "usage: ./runTrainDemo.out MobilentV2PostTrain /path/to/mobilenetV2Model path/to/train/images/ path/to/train/image/txt path/to/test/images/ path/to/test/image/txt"
                       << std::endl;
             return 0;
         }
+        MNNForwardType type = MNN_FORWARD_CPU;
+        if (argc >= 7) {
+            std::istringstream is(argv[6]);
+            int c;
+            is >> c;
+            type = (MNNForwardType)c;
+        }
 
         std::string trainImagesFolder = argv[2];
         std::string trainImagesTxt = argv[3];
@@ -147,24 +168,12 @@ class MobilenetV2TrainQuant : public DemoUnit {
             return 0;
         }
 
-        int bits = 8;
-        if (argc > 6) {
-            std::istringstream is(argv[6]);
-            is >> bits;
-        }
-        if (1 > bits || bits > 8) {
-            MNN_ERROR("bits must be 2-8, use 8 default\n");
-            bits = 8;
-        }
-
         auto inputOutputs = Variable::getInputAndOutput(varMap);
         auto inputs       = Variable::mapToSequence(inputOutputs.first);
         auto outputs      = Variable::mapToSequence(inputOutputs.second);
-
         std::shared_ptr<Module> model(NN::extract(inputs, outputs, true));
-        NN::turnQuantize(model.get(), bits);
 
-        MobilenetV2Utils::train(model, 1001, 1, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt);
+        MobilenetV2Utils::train(type, 4, model, 1001, 1, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt);
 
         return 0;
     }
@@ -173,4 +182,4 @@ class MobilenetV2TrainQuant : public DemoUnit {
 DemoUnitSetRegister(MobilenetV2Transfer, "MobilenetV2Transfer");
 DemoUnitSetRegister(MobilenetV2Train, "MobilenetV2Train");
 DemoUnitSetRegister(MobilenetV2PostTrain, "MobilenetV2PostTrain");
-DemoUnitSetRegister(MobilenetV2TrainQuant, "MobilenetV2TrainQuant");
+DemoUnitSetRegister(CifarMobilenetV2Train, "CifarMobilenetV2Train");
diff --git a/tools/train/source/grad/BinaryGrad.cpp b/tools/train/source/grad/BinaryGrad.cpp
index 7ace5d6e3..7c54daa2b 100644
--- a/tools/train/source/grad/BinaryGrad.cpp
+++ b/tools/train/source/grad/BinaryGrad.cpp
@@ -9,8 +9,9 @@
 #include "BinaryGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
 using namespace MNN::Express;
+namespace MNN {
+
 class EltwiseGrad : public OpGrad {
 public:
     virtual std::vector<Express::VARP> onGrad(Express::EXPRP expr,
@@ -193,10 +194,11 @@ class BinaryGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static BinaryGrad _c;
     OpGrad::insert((int)OpType_BinaryOp, &_c);
     static EltwiseGrad _d;
     OpGrad::insert((int)OpType_Eltwise, &_d);
-    return true;
-}();
+}
+REGISTER_GRAD(BinaryGrad, _create);
+};
diff --git a/tools/train/source/grad/BroadcastToGrad.cpp b/tools/train/source/grad/BroadcastToGrad.cpp
index df8f29a18..fb2828941 100644
--- a/tools/train/source/grad/BroadcastToGrad.cpp
+++ b/tools/train/source/grad/BroadcastToGrad.cpp
@@ -9,9 +9,8 @@
 #include "OpGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
 using namespace MNN::Express;
-
+namespace MNN {
 class BroadcastToGrad : public OpGrad {
 public:
     virtual std::vector<Express::VARP> onGrad(Express::EXPRP expr,
@@ -70,8 +69,9 @@ class BroadcastToGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static BroadcastToGrad _c;
     OpGrad::insert(OpType_BroadcastTo, &_c);
-    return true;
-}();
+}
+REGISTER_GRAD(BroadcastToGrad, _create);
+};
diff --git a/tools/train/source/grad/ConcatGrad.cpp b/tools/train/source/grad/ConcatGrad.cpp
index 09fe384a3..0e6db8200 100644
--- a/tools/train/source/grad/ConcatGrad.cpp
+++ b/tools/train/source/grad/ConcatGrad.cpp
@@ -9,9 +9,8 @@
 #include "OpGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
 using namespace MNN::Express;
-
+namespace MNN {
 class ConcatGrad : public OpGrad {
 public:
     virtual std::vector<Express::VARP> onGrad(Express::EXPRP expr,
@@ -34,8 +33,12 @@ class ConcatGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static ConcatGrad _c;
     OpGrad::insert((int)OpType_Concat, &_c);
-    return true;
-}();
+
+};
+REGISTER_GRAD(ConcatGrad, _create);
+
+}
+
diff --git a/tools/train/source/grad/ConvGrad.cpp b/tools/train/source/grad/ConvGrad.cpp
index ddb09ee39..1727ebc7a 100644
--- a/tools/train/source/grad/ConvGrad.cpp
+++ b/tools/train/source/grad/ConvGrad.cpp
@@ -10,7 +10,7 @@
 #include "core/Macro.h"
 using namespace std;
 using namespace MNN::Express;
-using namespace MNN;
+namespace MNN {
 class ConvGrad : public OpGrad {
 public:
     virtual std::vector<Express::VARP> onGrad(Express::EXPRP expr,
@@ -54,7 +54,7 @@ class ConvGrad : public OpGrad {
                 auto sH = conv2D->common->strideY;
                 auto dW = conv2D->common->dilateX;
                 auto dH = conv2D->common->dilateY;
-
+                
                 std::vector<int> padding {0, 0, 0, 0};
                 int kernelWidthSize = dW * (kW - 1) + 1;
                 int kernelHeightSize = dH * (kH - 1) + 1;
@@ -80,7 +80,7 @@ class ConvGrad : public OpGrad {
             conv2D->common->inputCount  = outputCount;
             conv2D->common->outputCount = inputCount;
             newOp->main.value           = conv2D;
-
+            
             auto expr = Expr::create(std::move(newOp), {outputDiff, inputs[1]});
             res[0]    = Variable::create(expr);
             auto resultShape = res[0]->getInfo();
@@ -136,7 +136,7 @@ class DeconvGrad : public OpGrad {
             conv2D->common->inputCount  = outputCount;
             conv2D->common->outputCount = inputCount;
             newOp->main.value           = conv2D;
-
+            
             auto expr = Expr::create(std::move(newOp), {outputDiff, inputs[1]});
             res[0]    = Variable::create(expr);
         }
@@ -161,12 +161,14 @@ class DeconvGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static ConvGrad _c;
     OpGrad::insert(OpType_Convolution, &_c);
     OpGrad::insert(OpType_ConvolutionDepthwise, &_c);
     static DeconvGrad _d;
     OpGrad::insert(OpType_Deconvolution, &_d);
     OpGrad::insert(OpType_DeconvolutionDepthwise, &_d);
-    return true;
-}();
+};
+
+REGISTER_GRAD(ConvGrad, _create);
+};
diff --git a/tools/train/source/grad/GatherGrad.cpp b/tools/train/source/grad/GatherGrad.cpp
index 4d68b7f17..d80f69827 100644
--- a/tools/train/source/grad/GatherGrad.cpp
+++ b/tools/train/source/grad/GatherGrad.cpp
@@ -8,7 +8,7 @@
 
 #include "OpGrad.hpp"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class GatherGrad : public OpGrad {
@@ -38,8 +38,12 @@ class GatherGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static GatherGrad _c;
     OpGrad::insert((int)OpType_GatherV2, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(GatherGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/GradOPRegister.cpp b/tools/train/source/grad/GradOPRegister.cpp
new file mode 100644
index 000000000..a8ddf5f6b
--- /dev/null
+++ b/tools/train/source/grad/GradOPRegister.cpp
@@ -0,0 +1,65 @@
+// This file is generated by Shell for ops register
+#include "OpGrad.hpp"
+namespace MNN {
+extern void ___TopKV2Grad_cpp___create__();
+extern void ___LoopGrad_cpp___create__();
+extern void ___SoftmaxGrad_cpp___create__();
+extern void ___GridSampleGrad_cpp___create__();
+extern void ___ReshapeGrad_cpp___create__();
+extern void ___ReluGrad_cpp___create__();
+extern void ___PoolGrad_cpp___create__();
+extern void ___GatherGrad_cpp___create__();
+extern void ___RoiPoolGrad_cpp___create__();
+extern void ___InterpGrad_cpp___create__();
+extern void ___RoiAlignGrad_cpp___create__();
+extern void ___MatMulGrad_cpp___create__();
+extern void ___RenderGrad_cpp___create__();
+extern void ___UnaryGrad_cpp___create__();
+extern void ___SeluGrad_cpp___create__();
+extern void ___SelectGrad_cpp___create__();
+extern void ___ZeroGrad_cpp___create__();
+extern void ___SliceGrad_cpp___create__();
+extern void ___ReduceGrad_cpp___create__();
+extern void ___ConcatGrad___create__();
+extern void ___BroadcastToGrad___create__();
+extern void ___BinaryGrad___create__();
+extern void ___TensorConvertGrad_cpp___create__();
+extern void ___RasterGrad_cpp___create__();
+extern void ___PermuteGrad_cpp___create__();
+extern void ___ConvGrad___create__();
+extern void ___StridedSliceGrad_cpp___create__();
+extern void ___MatrixBandPartGrad_cpp___create__();
+extern void ___ScaleGrad_cpp___create__();
+
+void registerGradOps() {
+___TopKV2Grad_cpp___create__();
+___LoopGrad_cpp___create__();
+___SoftmaxGrad_cpp___create__();
+___GridSampleGrad_cpp___create__();
+___ReshapeGrad_cpp___create__();
+___ReluGrad_cpp___create__();
+___PoolGrad_cpp___create__();
+___GatherGrad_cpp___create__();
+___RoiPoolGrad_cpp___create__();
+___InterpGrad_cpp___create__();
+___RoiAlignGrad_cpp___create__();
+___MatMulGrad_cpp___create__();
+___RenderGrad_cpp___create__();
+___UnaryGrad_cpp___create__();
+___SeluGrad_cpp___create__();
+___SelectGrad_cpp___create__();
+___ZeroGrad_cpp___create__();
+___SliceGrad_cpp___create__();
+___ReduceGrad_cpp___create__();
+___ConcatGrad___create__();
+___BroadcastToGrad___create__();
+___BinaryGrad___create__();
+___TensorConvertGrad_cpp___create__();
+___RasterGrad_cpp___create__();
+___PermuteGrad_cpp___create__();
+___ConvGrad___create__();
+___StridedSliceGrad_cpp___create__();
+___MatrixBandPartGrad_cpp___create__();
+___ScaleGrad_cpp___create__();
+}
+}
diff --git a/tools/train/source/grad/GridSampleGrad.cpp b/tools/train/source/grad/GridSampleGrad.cpp
index 28ccb2fc3..9ee844e74 100644
--- a/tools/train/source/grad/GridSampleGrad.cpp
+++ b/tools/train/source/grad/GridSampleGrad.cpp
@@ -9,7 +9,7 @@
 
 #include "OpGrad.hpp"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class GridSampleGrad : public OpGrad {
@@ -198,9 +198,13 @@ class GridSampleGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static GridSampleGrad _c;
     OpGrad::insert((int)OpType_GridSample, &_c);
     OpGrad::insert((int)OpType_Texture, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(GridSampleGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/InterpGrad.cpp b/tools/train/source/grad/InterpGrad.cpp
index 8339164f9..451d485e7 100644
--- a/tools/train/source/grad/InterpGrad.cpp
+++ b/tools/train/source/grad/InterpGrad.cpp
@@ -12,7 +12,7 @@
 #include <math.h>
 
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 
@@ -326,9 +326,13 @@ class InterpGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static InterpGrad _c;
     OpGrad::insert((int)OpType_Interp, &_c);
     OpGrad::insert((int)OpType_Resize, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(InterpGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/LoopGrad.cpp b/tools/train/source/grad/LoopGrad.cpp
index 60363fb12..d3a514b65 100644
--- a/tools/train/source/grad/LoopGrad.cpp
+++ b/tools/train/source/grad/LoopGrad.cpp
@@ -8,7 +8,7 @@
 
 #include "OpGrad.hpp"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class LoopGrad : public OpGrad {
@@ -396,8 +396,12 @@ class LoopGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static LoopGrad _c;
     OpGrad::insert(OpType_While, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(LoopGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/MatMulGrad.cpp b/tools/train/source/grad/MatMulGrad.cpp
index 3e93dc829..2d0e00958 100644
--- a/tools/train/source/grad/MatMulGrad.cpp
+++ b/tools/train/source/grad/MatMulGrad.cpp
@@ -8,7 +8,7 @@
 
 #include "MatMulGrad.hpp"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 class BatchMatMulGrad : public OpGrad {
 public:
@@ -221,10 +221,14 @@ class MatMulGrad : public OpGrad {
         return res;
     }
 };
-static const auto gRegister = []() {
+static void _create() {
     static MatMulGrad _c;
     OpGrad::insert(OpType_MatMul, &_c);
     static BatchMatMulGrad _d;
     OpGrad::insert(OpType_BatchMatMul, &_d);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(MatMulGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/MatrixBandPartGrad.cpp b/tools/train/source/grad/MatrixBandPartGrad.cpp
index a24fa6df6..29e53779d 100644
--- a/tools/train/source/grad/MatrixBandPartGrad.cpp
+++ b/tools/train/source/grad/MatrixBandPartGrad.cpp
@@ -9,7 +9,7 @@
 
 #include "OpGrad.hpp"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class MatrixBandPartGrad : public OpGrad {
@@ -42,8 +42,12 @@ class MatrixBandPartGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static MatrixBandPartGrad _c;
     OpGrad::insert((int)OpType_MatrixBandPart, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(MatrixBandPartGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/OpGrad.cpp b/tools/train/source/grad/OpGrad.cpp
index 824aac318..644c47d16 100644
--- a/tools/train/source/grad/OpGrad.cpp
+++ b/tools/train/source/grad/OpGrad.cpp
@@ -6,11 +6,13 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //
 
+#include <mutex>
 #include "OpGrad.hpp"
 using namespace std;
 using namespace MNN::Express;
 //#define MNN_TRAIN_DEBUG
 namespace MNN {
+extern void registerGradOps();
 static std::map<int, OpGrad*>& getConverter() {
     static std::map<int, OpGrad*> gConverterMap;
     return gConverterMap;
@@ -69,6 +71,12 @@ Express::VARP OpGrad::divideAvoidZero(MNN::Express::VARP y, MNN::Express::VARP x
     p = MNN::Express::_Maximum(p, MNN::Express::_Scalar<float>(0.000001f));
     return MNN::Express::_Divide(y, p) * sx;
 }
+static std::once_flag gInit;
+void OpGrad::init() {
+    std::call_once(gInit, []() {
+        registerGradOps();
+    });
+}
 
 std::pair<std::vector<Express::VARP>, std::vector<Express::VARP>> OpGrad::gradCommon(std::vector<Express::VARP> outputs, std::vector<Express::VARP> outputDiff, std::vector<Express::VARP> parameters) {
     if (outputs.size() != outputDiff.size()) {
@@ -107,6 +115,7 @@ std::pair<std::vector<Express::VARP>, std::vector<Express::VARP>> OpGrad::gradCo
 }
 
 std::map<Express::VARP, Express::VARP> OpGrad::gradCommon(std::vector<Express::VARP> outputs, const std::set<Express::VARP>& parameters, std::map<EXPRP, std::vector<VARP>>& backwardMap, const std::vector<std::string> blockName) {
+    init();
     auto executeOrder = Variable::getExecuteOrder(outputs);
     for (auto iter = executeOrder.rbegin(); iter != executeOrder.rend(); iter++) {
         auto expr    = *iter;
diff --git a/tools/train/source/grad/OpGrad.hpp b/tools/train/source/grad/OpGrad.hpp
index 16a0e3e99..6230198ce 100644
--- a/tools/train/source/grad/OpGrad.hpp
+++ b/tools/train/source/grad/OpGrad.hpp
@@ -26,6 +26,7 @@ class MNN_PUBLIC OpGrad {
     Type type() const {
         return mType;
     }
+    static void init();
     static Express::VARP divideAvoidZero(MNN::Express::VARP y, MNN::Express::VARP x);
 
     virtual std::vector<Express::VARP> onGrad(Express::EXPRP expr,
@@ -42,6 +43,11 @@ class MNN_PUBLIC OpGrad {
 protected:
     Type mType = LINEAR;
 };
+#define REGISTER_GRAD(f, c)       \
+    extern void ___##f##__##c##__() { \
+        c();                          \
+    }
+
 } // namespace MNN
 
 #endif
diff --git a/tools/train/source/grad/PermuteGrad.cpp b/tools/train/source/grad/PermuteGrad.cpp
index f08e7a497..79726ebde 100644
--- a/tools/train/source/grad/PermuteGrad.cpp
+++ b/tools/train/source/grad/PermuteGrad.cpp
@@ -8,7 +8,7 @@
 
 #include "OpGrad.hpp"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class TransposeGrad : public OpGrad {
@@ -67,10 +67,14 @@ class PermuteGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static PermuteGrad _c;
     OpGrad::insert((int)OpType_Permute, &_c);
     static TransposeGrad _d;
     OpGrad::insert((int)OpType_Transpose, &_d);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(PermuteGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/PoolGrad.cpp b/tools/train/source/grad/PoolGrad.cpp
index 829b959f8..57755f9a1 100644
--- a/tools/train/source/grad/PoolGrad.cpp
+++ b/tools/train/source/grad/PoolGrad.cpp
@@ -9,7 +9,7 @@
 #include "PoolGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class PoolGrad : public OpGrad {
@@ -35,8 +35,12 @@ class PoolGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static PoolGrad _c;
     OpGrad::insert(OpType_Pooling, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(PoolGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/RasterGrad.cpp b/tools/train/source/grad/RasterGrad.cpp
index bd4e75596..4d867ca75 100644
--- a/tools/train/source/grad/RasterGrad.cpp
+++ b/tools/train/source/grad/RasterGrad.cpp
@@ -9,7 +9,7 @@
 #include "OpGrad.hpp"
 #include "core/TensorUtils.hpp"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class RasterGrad : public OpGrad {
@@ -70,8 +70,12 @@ class RasterGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static RasterGrad _c;
     OpGrad::insert(OpType_Raster, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(RasterGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/ReduceGrad.cpp b/tools/train/source/grad/ReduceGrad.cpp
index a4b2e1fce..a4932ee61 100644
--- a/tools/train/source/grad/ReduceGrad.cpp
+++ b/tools/train/source/grad/ReduceGrad.cpp
@@ -8,7 +8,7 @@
 
 #include "OpGrad.hpp"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class ReduceGrad : public OpGrad {
@@ -93,10 +93,14 @@ class FillGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static ReduceGrad _c;
     OpGrad::insert(OpType_Reduction, &_c);
     static FillGrad _d;
     OpGrad::insert(OpType_Fill, &_d);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(ReduceGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/ReluGrad.cpp b/tools/train/source/grad/ReluGrad.cpp
index f6779a836..b836fbbf2 100644
--- a/tools/train/source/grad/ReluGrad.cpp
+++ b/tools/train/source/grad/ReluGrad.cpp
@@ -10,7 +10,7 @@
 #include "core/Macro.h"
 #include <string.h>
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 class PReluGrad : public OpGrad {
 public:
@@ -83,12 +83,16 @@ class Relu6Grad : public OpGrad {
         return result;
     }
 };
-static const auto gRegister = []() {
+static void _create() {
     static ReluGrad _c;
     OpGrad::insert(OpType_ReLU, &_c);
     static Relu6Grad _d;
     OpGrad::insert(OpType_ReLU6, &_d);
     static PReluGrad _e;
     OpGrad::insert(OpType_PReLU, &_e);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(ReluGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/RenderGrad.cpp b/tools/train/source/grad/RenderGrad.cpp
index d663a593f..57ebe14df 100644
--- a/tools/train/source/grad/RenderGrad.cpp
+++ b/tools/train/source/grad/RenderGrad.cpp
@@ -9,7 +9,7 @@
 #include "OpGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class RasterDiffGrad : public OpGrad {
@@ -26,8 +26,12 @@ class RasterDiffGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static RasterDiffGrad _c;
     OpGrad::insert(OpType_RasterDiff, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(RenderGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/ReshapeGrad.cpp b/tools/train/source/grad/ReshapeGrad.cpp
index 3d886717e..ca1d59d54 100644
--- a/tools/train/source/grad/ReshapeGrad.cpp
+++ b/tools/train/source/grad/ReshapeGrad.cpp
@@ -9,7 +9,7 @@
 #include "ReshapeGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class ReshapeGrad : public OpGrad {
@@ -40,10 +40,14 @@ class ReshapeGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static ReshapeGrad _c;
     OpGrad::insert(OpType_Reshape, &_c);
     OpGrad::insert(OpType_Squeeze, &_c);
     OpGrad::insert(OpType_Unsqueeze, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(ReshapeGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/RoiAlignGrad.cpp b/tools/train/source/grad/RoiAlignGrad.cpp
index 4737f06be..4b5eee7be 100644
--- a/tools/train/source/grad/RoiAlignGrad.cpp
+++ b/tools/train/source/grad/RoiAlignGrad.cpp
@@ -9,7 +9,7 @@
 #include "OpGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class RoiAlignGrad : public OpGrad {
@@ -35,8 +35,12 @@ class RoiAlignGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static RoiAlignGrad _c;
     OpGrad::insert(OpType_ROIAlign, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(RoiAlignGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/RoiPoolGrad.cpp b/tools/train/source/grad/RoiPoolGrad.cpp
index d1e9ead37..f8577fd56 100644
--- a/tools/train/source/grad/RoiPoolGrad.cpp
+++ b/tools/train/source/grad/RoiPoolGrad.cpp
@@ -9,7 +9,7 @@
 #include "OpGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class RoiPoolGrad : public OpGrad {
@@ -32,8 +32,12 @@ class RoiPoolGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static RoiPoolGrad _c;
     OpGrad::insert(OpType_ROIPooling, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(RoiPoolGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/ScaleGrad.cpp b/tools/train/source/grad/ScaleGrad.cpp
index 748fe5f13..dd5f0dc51 100644
--- a/tools/train/source/grad/ScaleGrad.cpp
+++ b/tools/train/source/grad/ScaleGrad.cpp
@@ -9,7 +9,7 @@
 #include "OpGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class ScaleGrad : public OpGrad {
@@ -32,8 +32,12 @@ class ScaleGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static ScaleGrad _c;
     OpGrad::insert(OpType_Scale, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(ScaleGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/SelectGrad.cpp b/tools/train/source/grad/SelectGrad.cpp
index 4d63a8647..77ae0f456 100644
--- a/tools/train/source/grad/SelectGrad.cpp
+++ b/tools/train/source/grad/SelectGrad.cpp
@@ -9,7 +9,7 @@
 #include "SelectGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class SelectGrad : public OpGrad {
@@ -36,8 +36,12 @@ class SelectGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static SelectGrad _c;
     OpGrad::insert(OpType_Select, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(SelectGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/SeluGrad.cpp b/tools/train/source/grad/SeluGrad.cpp
index 45a930351..4fae06bd8 100644
--- a/tools/train/source/grad/SeluGrad.cpp
+++ b/tools/train/source/grad/SeluGrad.cpp
@@ -9,7 +9,7 @@
 #include "OpGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class SeluGrad : public OpGrad {
@@ -38,8 +38,12 @@ class SeluGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static SeluGrad _c;
     OpGrad::insert(OpType_Selu, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(SeluGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/SliceGrad.cpp b/tools/train/source/grad/SliceGrad.cpp
index 6cc168413..3f1fbc7ad 100644
--- a/tools/train/source/grad/SliceGrad.cpp
+++ b/tools/train/source/grad/SliceGrad.cpp
@@ -8,7 +8,7 @@
 
 #include "OpGrad.hpp"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class SliceGrad : public OpGrad {
@@ -39,8 +39,12 @@ class SliceGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static SliceGrad _c;
     OpGrad::insert((int)OpType_Slice, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(SliceGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/SoftmaxGrad.cpp b/tools/train/source/grad/SoftmaxGrad.cpp
index 6e7d3ef78..6d144560c 100644
--- a/tools/train/source/grad/SoftmaxGrad.cpp
+++ b/tools/train/source/grad/SoftmaxGrad.cpp
@@ -10,7 +10,7 @@
 #include "core/Macro.h"
 #include <MNN/expr/ExprCreator.hpp>
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class SoftmaxGrad : public OpGrad {
@@ -45,8 +45,12 @@ class SoftmaxGrad : public OpGrad {
         return {inputGrad};
     }
 };
-static const auto gRegister = []() {
+static void _create() {
     static SoftmaxGrad _c;
     OpGrad::insert(OpType_Softmax, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(SoftmaxGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/StridedSliceGrad.cpp b/tools/train/source/grad/StridedSliceGrad.cpp
index 364567e09..90fc17612 100644
--- a/tools/train/source/grad/StridedSliceGrad.cpp
+++ b/tools/train/source/grad/StridedSliceGrad.cpp
@@ -9,7 +9,7 @@
 #include "OpGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class StridedSliceGrad : public OpGrad {
@@ -39,8 +39,12 @@ class StridedSliceGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static StridedSliceGrad _c;
     OpGrad::insert(OpType_StridedSlice, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(StridedSliceGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/TensorConvertGrad.cpp b/tools/train/source/grad/TensorConvertGrad.cpp
index 3142aa610..ea0da919b 100644
--- a/tools/train/source/grad/TensorConvertGrad.cpp
+++ b/tools/train/source/grad/TensorConvertGrad.cpp
@@ -8,7 +8,7 @@
 
 #include "TensorConvertGrad.hpp"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class TensorConvertGrad : public OpGrad {
@@ -22,8 +22,12 @@ class TensorConvertGrad : public OpGrad {
         return result;
     }
 };
-static const auto gRegister = []() {
+static void _create() {
     static TensorConvertGrad _c;
     OpGrad::insert(OpType_ConvertTensor, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(TensorConvertGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/TopKV2Grad.cpp b/tools/train/source/grad/TopKV2Grad.cpp
index 65d4fadc7..b996dd5f4 100644
--- a/tools/train/source/grad/TopKV2Grad.cpp
+++ b/tools/train/source/grad/TopKV2Grad.cpp
@@ -9,7 +9,7 @@
 #include "OpGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class TopKV2Grad : public OpGrad {
@@ -30,8 +30,12 @@ class TopKV2Grad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static TopKV2Grad _c;
     OpGrad::insert(OpType_TopKV2, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(TopKV2Grad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/UnaryGrad.cpp b/tools/train/source/grad/UnaryGrad.cpp
index f213e0c9e..4eb266a8c 100644
--- a/tools/train/source/grad/UnaryGrad.cpp
+++ b/tools/train/source/grad/UnaryGrad.cpp
@@ -12,7 +12,7 @@
 #define MNN_PI 3.14159265358979323846
 
 using namespace std;
-using namespace MNN;
+namespace MNN {
 using namespace MNN::Express;
 
 class UnaryGrad : public OpGrad {
@@ -221,12 +221,16 @@ class TanhGrad : public OpGrad {
     }
 };
 
-static const auto gRegister = []() {
+static void _create() {
     static UnaryGrad _c;
     static SigmoidGrad _s;
     static TanhGrad _t;
     OpGrad::insert(OpType_UnaryOp, &_c);
     OpGrad::insert(OpType_Sigmoid, &_s);
     OpGrad::insert(OpType_TanH, &_t);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(UnaryGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/grad/ZeroGrad.cpp b/tools/train/source/grad/ZeroGrad.cpp
index e77f1866d..5b0e251f1 100644
--- a/tools/train/source/grad/ZeroGrad.cpp
+++ b/tools/train/source/grad/ZeroGrad.cpp
@@ -9,7 +9,7 @@
 #include "ReluGrad.hpp"
 #include "core/Macro.h"
 using namespace std;
-using namespace MNN;
+namespace MNN {
 
 class ZeroGrad : public OpGrad {
 public:
@@ -23,8 +23,12 @@ class ZeroGrad : public OpGrad {
         return result;
     }
 };
-static const auto gRegister = []() {
+static void _create() {
     static ZeroGrad _c;
     OpGrad::insert(OpType_ZeroGrad, &_c);
-    return true;
-}();
+
+}
+
+REGISTER_GRAD(ZeroGrad_cpp, _create);
+};
+
diff --git a/tools/train/source/models/MobilenetV2.cpp b/tools/train/source/models/MobilenetV2.cpp
index fe3daef0c..370f45b62 100644
--- a/tools/train/source/models/MobilenetV2.cpp
+++ b/tools/train/source/models/MobilenetV2.cpp
@@ -15,7 +15,7 @@ namespace Model {
 using namespace MNN::Express;
 class _ConvBnRelu : public Module {
 public:
-    _ConvBnRelu(std::vector<int> inputOutputChannels, int kernelSize = 3, int stride = 1, bool depthwise = false);
+    _ConvBnRelu(std::vector<int> inputOutputChannels, int kernelSize = 3, int stride = 1, bool depthwise = false, bool useBn = true);
 
     virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP> &inputs) override;
 
@@ -24,13 +24,13 @@ class _ConvBnRelu : public Module {
 };
 
 std::shared_ptr<Module> ConvBnRelu(std::vector<int> inputOutputChannels, int kernelSize = 3, int stride = 1,
-                                   bool depthwise = false) {
-    return std::shared_ptr<Module>(new _ConvBnRelu(inputOutputChannels, kernelSize, stride, depthwise));
+                                   bool depthwise = false, bool useBn = true) {
+    return std::shared_ptr<Module>(new _ConvBnRelu(inputOutputChannels, kernelSize, stride, depthwise, useBn));
 }
 
 class _BottleNeck : public Module {
 public:
-    _BottleNeck(std::vector<int> inputOutputChannels, int stride, int expandRatio);
+    _BottleNeck(std::vector<int> inputOutputChannels, int stride, int expandRatio, bool useBn = true);
 
     virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP> &inputs) override;
 
@@ -38,11 +38,11 @@ class _BottleNeck : public Module {
     bool useShortcut = false;
 };
 
-std::shared_ptr<Module> BottleNeck(std::vector<int> inputOutputChannels, int stride, int expandRatio) {
-    return std::shared_ptr<Module>(new _BottleNeck(inputOutputChannels, stride, expandRatio));
+std::shared_ptr<Module> BottleNeck(std::vector<int> inputOutputChannels, int stride, int expandRatio, bool useBn) {
+    return std::shared_ptr<Module>(new _BottleNeck(inputOutputChannels, stride, expandRatio, useBn));
 }
 
-_ConvBnRelu::_ConvBnRelu(std::vector<int> inputOutputChannels, int kernelSize, int stride, bool depthwise) {
+_ConvBnRelu::_ConvBnRelu(std::vector<int> inputOutputChannels, int kernelSize, int stride, bool depthwise, bool useBn) {
     int inputChannels = inputOutputChannels[0], outputChannels = inputOutputChannels[1];
 
     NN::ConvOption convOption;
@@ -53,9 +53,12 @@ _ConvBnRelu::_ConvBnRelu(std::vector<int> inputOutputChannels, int kernelSize, i
     convOption.depthwise  = depthwise;
     conv.reset(NN::Conv(convOption, false, std::shared_ptr<Initializer>(Initializer::MSRA())));
 
-    bn.reset(NN::BatchNorm(outputChannels));
-
-    registerModel({conv, bn});
+    if (useBn) {
+        bn.reset(NN::BatchNorm(outputChannels));
+        registerModel({conv, bn});
+    } else {
+        registerModel({conv});
+    }
 }
 
 std::vector<Express::VARP> _ConvBnRelu::onForward(const std::vector<Express::VARP> &inputs) {
@@ -63,13 +66,15 @@ std::vector<Express::VARP> _ConvBnRelu::onForward(const std::vector<Express::VAR
     VARP x = inputs[0];
 
     x = conv->forward(x);
-    x = bn->forward(x);
+    if (nullptr != bn.get()) {
+        x = bn->forward(x);
+    }
     x = _Relu6(x);
 
     return {x};
 }
 
-_BottleNeck::_BottleNeck(std::vector<int> inputOutputChannels, int stride, int expandRatio) {
+_BottleNeck::_BottleNeck(std::vector<int> inputOutputChannels, int stride, int expandRatio, bool useBn) {
     int inputChannels = inputOutputChannels[0], outputChannels = inputOutputChannels[1];
     int expandChannels = inputChannels * expandRatio;
 
@@ -78,10 +83,10 @@ _BottleNeck::_BottleNeck(std::vector<int> inputOutputChannels, int stride, int e
     }
 
     if (expandRatio != 1) {
-        layers.emplace_back(ConvBnRelu({inputChannels, expandChannels}, 1));
+        layers.emplace_back(ConvBnRelu({inputChannels, expandChannels}, 1, 1, false, useBn));
     }
 
-    layers.emplace_back(ConvBnRelu({expandChannels, expandChannels}, 3, stride, true));
+    layers.emplace_back(ConvBnRelu({expandChannels, expandChannels}, 3, stride, true, useBn));
 
     NN::ConvOption convOption;
     convOption.kernelSize = {1, 1};
@@ -91,7 +96,9 @@ _BottleNeck::_BottleNeck(std::vector<int> inputOutputChannels, int stride, int e
     convOption.depthwise  = false;
     layers.emplace_back(NN::Conv(convOption, false, std::shared_ptr<Initializer>(Initializer::MSRA())));
 
-    layers.emplace_back(NN::BatchNorm(outputChannels));
+    if (useBn) {
+        layers.emplace_back(NN::BatchNorm(outputChannels));
+    }
 
     registerModel(layers);
 }
@@ -111,7 +118,7 @@ std::vector<Express::VARP> _BottleNeck::onForward(const std::vector<Express::VAR
     return {x};
 }
 
-MobilenetV2::MobilenetV2(int numClasses, float widthMult, int divisor) {
+MobilenetV2::MobilenetV2(int numClasses, float widthMult, int divisor, bool useBn) {
     int inputChannels = 32;
     int lastChannels  = 1280;
 
@@ -127,7 +134,7 @@ MobilenetV2::MobilenetV2(int numClasses, float widthMult, int divisor) {
     inputChannels = makeDivisible(inputChannels * widthMult, divisor);
     lastChannels  = makeDivisible(lastChannels * std::max(1.0f, widthMult), divisor);
 
-    firstConv = ConvBnRelu({3, inputChannels}, 3, 2);
+    firstConv = ConvBnRelu({3, inputChannels}, 3, 2, false, useBn);
 
     for (int i = 0; i < invertedResidualSetting.size(); i++) {
         std::vector<int> setting = invertedResidualSetting[i];
@@ -144,12 +151,12 @@ MobilenetV2::MobilenetV2(int numClasses, float widthMult, int divisor) {
                 stride = s;
             }
 
-            bottleNeckBlocks.emplace_back(BottleNeck({inputChannels, outputChannels}, stride, t));
+            bottleNeckBlocks.emplace_back(BottleNeck({inputChannels, outputChannels}, stride, t, useBn));
             inputChannels = outputChannels;
         }
     }
 
-    lastConv = ConvBnRelu({inputChannels, lastChannels}, 1);
+    lastConv = ConvBnRelu({inputChannels, lastChannels}, 1, 1, false, useBn);
 
     dropout.reset(NN::Dropout(0.1));
     fc.reset(NN::Linear(lastChannels, numClasses, true, std::shared_ptr<Initializer>(Initializer::MSRA())));
diff --git a/tools/train/source/models/MobilenetV2.hpp b/tools/train/source/models/MobilenetV2.hpp
index 88e95e749..c5c4c9906 100644
--- a/tools/train/source/models/MobilenetV2.hpp
+++ b/tools/train/source/models/MobilenetV2.hpp
@@ -24,7 +24,7 @@ class MNN_PUBLIC MobilenetV2 : public Express::Module {
 public:
     // use tensorflow numClasses = 1001, which label 0 means outlier of the original 1000 classes
     // so you maybe need to add 1 to your true labels, if you are testing with ImageNet dataset
-    MobilenetV2(int numClasses = 1001, float widthMult = 1.0f, int divisor = 8);
+    MobilenetV2(int numClasses = 1001, float widthMult = 1.0f, int divisor = 8, bool useBn = true);
 
     virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP> &inputs) override;
 
diff --git a/transformers/diffusion/main.cpp b/transformers/diffusion/main.cpp
index 946175e34..75abd8924 100644
--- a/transformers/diffusion/main.cpp
+++ b/transformers/diffusion/main.cpp
@@ -2,36 +2,46 @@
 #include "pipeline.hpp"
 
 int main(int argc, const char* argv[]) {
-    if (argc < 3) {
-        MNN_PRINT("Usage: ./diffusion_demo <resource_path> <model_type> <output_image_name> <input_text>\n");
+    if (argc < 7) {
+        MNN_PRINT("=====================================================================================================================\n");
+        MNN_PRINT("Usage: ./diffusion_demo <resource_path> <model_type> <output_image_name> <memory_mode> <backend_type> <input_text>\n");
+        MNN_PRINT("=====================================================================================================================\n");
         return 0;
     }
 
     auto resource_path = argv[1];
     auto model_type = (diffusion::DiffusionModelType)atoi(argv[2]);
     auto img_name = argv[3];
-    
+    auto memory_mode = atoi(argv[4]);
+    auto backend_type = (MNNForwardType)atoi(argv[5]);
     std::string input_text;
-    for (int i = 4; i < argc; ++i) {
+    for (int i = 6; i < argc; ++i) {
         input_text += argv[i];
         if (i < argc - 1) {
             input_text += " ";
         }
     }
     
-    MNN_PRINT("model resource path: %s\n", resource_path);
+    MNN_PRINT("Model resource path: %s\n", resource_path);
     if(model_type == diffusion::STABLE_DIFFUSION_1_5) {
-        MNN_PRINT("model type is stable diffusion 1.5\n");
+        MNN_PRINT("Model type is stable diffusion 1.5\n");
     } else if (model_type == diffusion::STABLE_DIFFUSION_TAIYI_CHINESE) {
-        MNN_PRINT("model type is stable diffusion taiyi chinese version\n");
+        MNN_PRINT("Model type is stable diffusion taiyi chinese version\n");
+    } else {
+        MNN_PRINT("Error: Model type %d not supported, please check\n", (int)model_type);
+    }
+
+    if(memory_mode == 0) {
+	    MNN_PRINT("(Memory Lack) Each diffusion model will be initilized when using, freed after using.\n");
     } else {
-        MNN_PRINT("model type: %d not supported, please check\n", (int)model_type);
+	    MNN_PRINT("(Memory Enough) All Diffusion models will be initilized when application enter.\n");
     }
-    MNN_PRINT("output img_name: %s\n", img_name);
-    MNN_PRINT("input texts: %s\n", input_text.c_str());
+    MNN_PRINT("Backend type: %d\n", (int)backend_type);
+    MNN_PRINT("Output image name: %s\n", img_name);
+    MNN_PRINT("Prompt text: %s\n", input_text.c_str());
 
     
-    diffusion::Pipeline pipeline(resource_path, model_type);
+    diffusion::Pipeline pipeline(resource_path, model_type, backend_type, memory_mode);
     pipeline.run(input_text, img_name);
     return 0;
 }
diff --git a/transformers/diffusion/pipeline.cpp b/transformers/diffusion/pipeline.cpp
index 194ebebd2..ed35ba705 100644
--- a/transformers/diffusion/pipeline.cpp
+++ b/transformers/diffusion/pipeline.cpp
@@ -24,23 +24,6 @@ using namespace CV;
 
 namespace diffusion {
 
-static inline int64_t getTime() {
-    uint64_t time;
-#if defined(_MSC_VER)
-    LARGE_INTEGER now, freq;
-    QueryPerformanceCounter(&now);
-    QueryPerformanceFrequency(&freq);
-    uint64_t sec = now.QuadPart / freq.QuadPart;
-    uint64_t usec = (now.QuadPart % freq.QuadPart) * 1000000 / freq.QuadPart;
-    time = sec * 1000000 + usec;
-#else
-    struct timeval tv;
-    gettimeofday(&tv, nullptr);
-    time = static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
-#endif
-    return time;
-}
-
 void display_progress(int cur, int total){
     putchar('\r');
     MNN_PRINT("[");
@@ -52,7 +35,8 @@ void display_progress(int cur, int total){
     fflush(stdout);
 }
 
-Pipeline::Pipeline(std::string modelPath, DiffusionModelType modelType) : mModelPath(modelPath), mModelType(modelType) {
+Pipeline::Pipeline(std::string modelPath, DiffusionModelType modelType, MNNForwardType backendType, int memoryMode) : 
+    mModelPath(modelPath), mModelType(modelType), mBackendType(backendType), mMemoryMode(memoryMode) {
     if(modelType == STABLE_DIFFUSION_1_5) {
         mMaxTextLen = 77;
     } else if(modelType == diffusion::STABLE_DIFFUSION_TAIYI_CHINESE) {
@@ -86,17 +70,26 @@ Pipeline::Pipeline(std::string modelPath, DiffusionModelType modelType) : mModel
      };
 }
 
-bool Pipeline::load_modules(std::string modelPath) {
+bool Pipeline::load_modules() {
     AUTOTIME;
     ScheduleConfig config;
     BackendConfig backendConfig;
-//    config.type          = MNN_FORWARD_CPU;
-    config.type          = MNN_FORWARD_OPENCL;
-    config.mode     = MNN_GPU_MEMORY_BUFFER | MNN_GPU_TUNING_FAST;
+    config.type = mBackendType;
+    if(config.type == MNN_FORWARD_CPU) {
+        backendConfig.memory = BackendConfig::Memory_Low;
+        config.numThread = 4;
+    } else if(config.type == MNN_FORWARD_OPENCL) {
+        config.mode = MNN_GPU_MEMORY_BUFFER | MNN_GPU_TUNING_FAST;
+    } else {
+        config.numThread = 1;
+    }
+
     backendConfig.precision = BackendConfig::Precision_Low;
-    backendConfig.memory = BackendConfig::Memory_Low;
     config.backendConfig = &backendConfig;
 
+    auto exe = ExecutorScope::Current();
+    exe->lazyEval = false;
+    exe->setGlobalExecutorConfig(config.type, backendConfig, config.numThread);
 
     Module::Config module_config;
     module_config.shapeMutable = false;
@@ -113,48 +106,47 @@ bool Pipeline::load_modules(std::string modelPath) {
     mTimestepVar = _Input({1}, NCHW, halide_type_of<int>());
     mSampleVar = _Concat({mLatentVar, mLatentVar}, 0);
     
-    MNN_PRINT("Model loading and initilizing...\n");
     MNN_PRINT("First time initilizing may cost a few seconds to create cachefile, please wait ...\n");
 
     VARP text_embeddings;
     mModules.resize(3);
     // load text_encoder model
     {
-        std::string model_path = modelPath + "/text_encoder.mnn";
+        std::string model_path = mModelPath + "/text_encoder.mnn";
         mModules[0].reset(Module::load(
             {"input_ids"}, {"last_hidden_state", "pooler_output"}, model_path.c_str(), runtime_manager_, &module_config));
         
-        auto outputs = mModules[0]->onForward({mPromptVar});
-        text_embeddings = _Convert(outputs[0], NCHW);
-
+	if(mMemoryMode > 0) { 
+            auto outputs = mModules[0]->onForward({mPromptVar});
+            text_embeddings = _Convert(outputs[0], NCHW);
+	}
         display_progress(1, 3);
     }
     // load unet model
     {
-        std::string model_path = modelPath + "/unet.mnn";
+        std::string model_path = mModelPath + "/unet.mnn";
         mModules[1].reset(Module::load(
             {"sample", "timestep", "encoder_hidden_states"}, {"out_sample"}, model_path.c_str(), runtime_manager_, &module_config));
         
-        auto outputs = mModules[1]->onForward({mSampleVar, mTimestepVar, text_embeddings});
-
-        auto output = _Convert(outputs[0], NCHW);
+	if(mMemoryMode > 0) {
+            auto outputs = mModules[1]->onForward({mSampleVar, mTimestepVar, text_embeddings});
+            auto output = _Convert(outputs[0], NCHW);
+	}
         display_progress(2, 3);
     }
     // load vae_decoder model
     {
-        std::string model_path = modelPath + "/vae_decoder.mnn";
+        std::string model_path = mModelPath + "/vae_decoder.mnn";
         mModules[2].reset(Module::load(
             {"latent_sample"}, {"sample"}, model_path.c_str(), runtime_manager_, &module_config));
-
+        
+	if(mMemoryMode > 0) {
         auto outputs = mModules[2]->onForward({mLatentVar});
         auto output = _Convert(outputs[0], NCHW);
-        display_progress(3, 3);
+	}
+	display_progress(3, 3);
     }
 
-    auto exe = ExecutorScope::Current();
-    exe->lazyEval = false;
-    exe->setGlobalExecutorConfig(config.type, backendConfig, config.numThread);
-
     return true;
 }
 
@@ -321,7 +313,7 @@ VARP Pipeline::vae_decoder(VARP latent) {
     return image;
 }
 
-bool Pipeline::run(const std::string& sentence, const std::string& img_name) {
+bool Pipeline::run(const std::string& prompt, const std::string& imagePath) {
     std::unique_ptr<diffusion::Tokenizer> tok;
     if(mModelType == STABLE_DIFFUSION_1_5) {
         tok.reset(new diffusion::CLIPTokenizer);
@@ -329,18 +321,18 @@ bool Pipeline::run(const std::string& sentence, const std::string& img_name) {
         tok.reset(new diffusion::BertTokenizer);
     }
     tok->load(mModelPath);
-    load_modules(mModelPath);
+    load_modules();
     
     AUTOTIME;
-    auto ids = tok->encode(sentence, mMaxTextLen);
+    auto ids = tok->encode(prompt, mMaxTextLen);
     auto text_embeddings = text_encoder(ids);
     
     auto latent = unet(text_embeddings);
 
     auto image = vae_decoder(latent);
-    bool res = imwrite(img_name, image);
+    bool res = imwrite(imagePath, image);
     if (res) {
-        MNN_PRINT("SUCCESS! write to %s\n", img_name.c_str());
+        MNN_PRINT("SUCCESS! write generated image to %s\n", imagePath.c_str());
     }
     return true;
 }
diff --git a/transformers/diffusion/pipeline.hpp b/transformers/diffusion/pipeline.hpp
index 2d8cc2811..3ce1e95f0 100644
--- a/transformers/diffusion/pipeline.hpp
+++ b/transformers/diffusion/pipeline.hpp
@@ -19,11 +19,11 @@ typedef enum {
     
 class Pipeline {
 public:
-    Pipeline(std::string modelPath, DiffusionModelType modelType);
+    Pipeline(std::string modelPath, DiffusionModelType modelType, MNNForwardType backendType, int memoryMode);
     ~Pipeline() = default;
-    bool run(const std::string& sentence, const std::string& img_name);
+    bool run(const std::string& prompt, const std::string& imagePath);
 private:
-    bool load_modules(std::string modelPath);
+    bool load_modules();
     VARP step_plms(VARP sample, VARP model_output, int index);
     VARP text_encoder(const std::vector<int>& ids);
     VARP unet(VARP text_embeddings);
@@ -31,16 +31,19 @@ class Pipeline {
 private:
     std::shared_ptr<Executor::RuntimeManager> runtime_manager_;
     std::vector<std::shared_ptr<Module>> mModules;
-
-    std::string mModelPath;
-    DiffusionModelType mModelType;
-    int mMaxTextLen = 77;
     // step_plms
     std::vector<int> mTimeSteps;
     std::vector<float> mAlphas;
     std::vector<VARP> mEts;
     VARP mSample;
     VARP mLatentVar, mPromptVar, mTimestepVar, mSampleVar;
+
+private:
+    std::string mModelPath;
+    DiffusionModelType mModelType;
+    int mMaxTextLen = 77;
+    int mMemoryMode;
+    MNNForwardType mBackendType;
 };
 
 }
diff --git a/transformers/llm/config.json b/transformers/llm/config.json
index 7025fad4b..f34f70063 100755
--- a/transformers/llm/config.json
+++ b/transformers/llm/config.json
@@ -5,5 +5,11 @@
     "backend_type": "cpu",
     "thread_num": 4,
     "precision": "low",
-    "memory": "low"
-}
+    "memory": "low",
+
+    "is_batch_quant": 1,
+    
+    "reuse_kv": false,
+    "quant_kv": 0,
+    "kvcache_limit": -1
+}
\ No newline at end of file
diff --git a/transformers/llm/engine/CMakeLists.txt b/transformers/llm/engine/CMakeLists.txt
index 767cc272e..2601e4d92 100644
--- a/transformers/llm/engine/CMakeLists.txt
+++ b/transformers/llm/engine/CMakeLists.txt
@@ -4,24 +4,22 @@ include_directories(${CMAKE_CURRENT_LIST_DIR}/include/)
 # source files
 FILE(GLOB SRCS ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
 
-if (MSVC)
-    # compile static lib, surrpot Winwows
-    add_library(llm STATIC ${SRCS})
-    target_link_libraries(llm ${MNN_DEPS})
-else()
-    if (MNN_SEP_BUILD)
-        if (MNN_BUILD_SHARED_LIBS)
-            # compile dynamic so, support Linux/Mac
-            add_library(llm SHARED ${SRCS})
-            set_target_properties(llm PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
-            target_link_libraries(llm ${MNN_DEPS})
-        else()
-            add_library(llm STATIC ${SRCS})
-        endif()
+if (MNN_SEP_BUILD)
+    if (MNN_BUILD_SHARED_LIBS)
+        # compile dynamic so, support Linux/Mac
+        add_library(llm SHARED ${SRCS})
+        set_target_properties(llm PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
+        target_link_libraries(llm ${MNN_DEPS})
     else()
-        add_library(llm OBJECT ${SRCS})
+        add_library(llm STATIC ${SRCS})
     endif()
+else()
+    add_library(llm OBJECT ${SRCS})
 endif()
 
 add_executable(llm_demo ${CMAKE_CURRENT_LIST_DIR}/llm_demo.cpp)
-target_link_libraries(llm_demo llm)
\ No newline at end of file
+IF (NOT MNN_SEP_BUILD)
+    target_link_libraries(llm_demo ${MNN_DEPS})
+ELSE ()
+    target_link_libraries(llm_demo ${MNN_DEPS} llm)
+ENDIF ()
\ No newline at end of file
diff --git a/transformers/llm/engine/include/llm/llm.hpp b/transformers/llm/engine/include/llm/llm.hpp
index 4e1f445df..a4b868592 100644
--- a/transformers/llm/engine/include/llm/llm.hpp
+++ b/transformers/llm/engine/include/llm/llm.hpp
@@ -70,6 +70,11 @@ class MNN_PUBLIC Llm {
     // config function
     std::string dump_config();
     bool set_config(const std::string& content);
+    // lora function
+    size_t apply_lora(const std::string& lora_path);
+    Llm* create_lora(const std::string& lora_path);
+    bool release_module(size_t index);
+    bool select_module(size_t index);
     friend class Pipeline;
 public:
     // forward info
@@ -89,8 +94,8 @@ class MNN_PUBLIC Llm {
     MNN::Express::VARP inputs_embeds_, attention_mask_, position_ids_;
     std::shared_ptr<MNN::Express::Executor::RuntimeManager> runtime_manager_;
     std::vector<std::shared_ptr<MNN::Express::Module>> modules_;
-    std::vector<std::shared_ptr<MNN::Express::Module>> decode_modules_;
-    std::vector<std::shared_ptr<MNN::Express::Module>> prefill_modules_;
+    std::vector<std::shared_ptr<MNN::Express::Module>> prefill_modules_, decode_modules_, current_modules_;
+    const MNN::Express::Module* base_module_ = nullptr;
     void init_runtime();
     std::string decode(int id);
     bool is_stop(int token_id);
@@ -98,6 +103,8 @@ class MNN_PUBLIC Llm {
     virtual MNN::Express::VARP embedding(const std::vector<int>& input_ids);
     virtual MNN::Express::VARP gen_attention_mask(int seq_len);
     virtual MNN::Express::VARP gen_position_ids(int seq_len);
+    bool mTracing = false;
+
 };
 
 // Embedding start
diff --git a/transformers/llm/engine/llm_demo.cpp b/transformers/llm/engine/llm_demo.cpp
index 416154f84..3e41b2eb0 100644
--- a/transformers/llm/engine/llm_demo.cpp
+++ b/transformers/llm/engine/llm_demo.cpp
@@ -14,23 +14,9 @@
 using namespace MNN::Transformer;
 static void trace_prepare(Llm* llm) {
     MNN_PRINT("Prepare for resize opt Begin\n");
-    std::vector<std::string> prompts = {
-        "Hello",
-    };
     llm->trace(true);
-    int prompt_len = 0;
-    int decode_len = 0;
-    int64_t prefill_time = 0;
-    int64_t decode_time = 0;
-    // llm->warmup();
-    for (int i = 0; i < prompts.size(); i++) {
-        std::ostringstream cacheOs;
-        llm->response(prompts[i], &cacheOs);
-        prompt_len += llm->prompt_len_;
-        decode_len += llm->gen_seq_len_;
-        prefill_time += llm->prefill_us_;
-        decode_time += llm->decode_us_;
-    }
+    std::ostringstream cacheOs;
+    llm->response("Hello", &cacheOs);
     MNN_PRINT("Prepare for resize opt End\n");
     llm->trace(false);
 }
@@ -181,7 +167,7 @@ int main(int argc, const char* argv[]) {
         AUTOTIME;
         llm->load();
     }
-    if (true) {
+    if (false) {
         AUTOTIME;
         trace_prepare(llm.get());
     }
diff --git a/transformers/llm/engine/src/llm.cpp b/transformers/llm/engine/src/llm.cpp
index 4ed60d9c2..efa9a5d23 100644
--- a/transformers/llm/engine/src/llm.cpp
+++ b/transformers/llm/engine/src/llm.cpp
@@ -97,8 +97,10 @@ void Llm::init_runtime() {
     runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config));
     runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0);
     runtime_manager_->setHint(MNN::Interpreter::DYNAMIC_QUANT_OPTIONS, 1); // 1: per batch quant, 2: per tensor quant
-    runtime_manager_->setHint(MNN::Interpreter::KVCACHE_QUANT_OPTIONS, config_->quant_kv()); // 0: no quant, 1: quant key, 2: quant value, 3: quant kv
-
+    runtime_manager_->setHint(MNN::Interpreter::KVCACHE_QUANT_OPTIONS, config_->quant_kv());
+    runtime_manager_->setHint(MNN::Interpreter::KVCACHE_SIZE_LIMIT, config_->kvcache_limit());
+    runtime_manager_->setExternalPath("/tmp/.kvcache", MNN::Interpreter::EXTERNAL_PATH_KVCACHE_DIR);
+    
 #if DEBUG_MODE==1
     runtime_manager_->setMode(MNN::Interpreter::Session_Debug);
     _initTimeTrace();
@@ -130,6 +132,10 @@ void Llm::load() {
     Module::Config module_config;
     module_config.shapeMutable = true;
     module_config.rearrange = true;
+    // using base module for lora module
+    if (base_module_ != nullptr) {
+        module_config.base = base_module_;
+    }
     int layer_nums = config_->layer_nums();
     if (is_single_) {
         // load single model
@@ -164,6 +170,54 @@ void Llm::load() {
     prefill_modules_ = modules_;
 }
 
+size_t Llm::apply_lora(const std::string& lora_path) {
+    std::string model_path = config_->base_dir_ + "/" + lora_path;
+    Module::Config module_config;
+    module_config.shapeMutable = true;
+    module_config.rearrange = true;
+    module_config.base = modules_.begin()->get();
+    size_t lora_index = modules_.size();
+    modules_.emplace_back(Module::load({"input_ids", "attention_mask", "position_ids", "past_key_values"},
+                                       {"logits", "presents"}, model_path.c_str(), runtime_manager_, &module_config));
+    select_module(lora_index);
+    return lora_index;
+}
+
+Llm* Llm::create_lora(const std::string& lora_path) {
+    auto llm = new Llm(config_);
+    llm->set_config("{\"llm_model\": \"" + lora_path + "\"}");
+    llm->base_module_ = modules_.begin()->get();
+    llm->load();
+    return llm;
+}
+
+bool Llm::release_module(size_t index) {
+    if (index >= modules_.size()) {
+        return false;
+    }
+    if (prefill_modules_[0] == modules_[index]) {
+        select_module(0);
+    }
+    modules_[index].reset();
+    return true;
+}
+
+bool Llm::select_module(size_t index) {
+    if (index >= modules_.size()) {
+        return false;
+    }
+    if (modules_[index] == nullptr) {
+        return false;
+    }
+    if (decode_modules_.empty()) {
+        decode_modules_.resize(modules_.size());
+        prefill_modules_.resize(modules_.size());
+    }
+    decode_modules_[0].reset(Module::clone(modules_[index].get()));
+    prefill_modules_[0] = modules_[index];
+    return true;
+}
+
 void Llm::trace(bool start) {
     auto status = MNN::Interpreter::Session_Resize_Check;
     if (start) {
@@ -175,6 +229,7 @@ void Llm::trace(bool start) {
         m->traceOrOptimize(status);
     }
     runtime_manager_->updateCache();
+    mTracing = start;
 }
 
 VARP Llm::forward(const std::vector<int>& input_ids) {
@@ -185,11 +240,10 @@ VARP Llm::forward(const std::vector<int>& input_ids) {
     if (is_single_) {
         // single model
         auto hidden_states = embedding(input_ids);
-        auto outputs = modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]});
+        auto outputs = current_modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]});
         if (outputs.empty()) {
             return nullptr;
         }
-        ExecutorScope::Current()->gc(Executor::FULL);
         logits = outputs[0];
         past_key_values_[0] = outputs[1];
     } else {
@@ -199,14 +253,13 @@ VARP Llm::forward(const std::vector<int>& input_ids) {
         ExecutorScope::Current()->gc(Executor::FULL);
         for (int i = 0; i < layer_nums; i++) {
             AUTOTIME;
-            auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]});
+            auto outputs = current_modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]});
             hidden_states = outputs[0];
             past_key_values_[i] = outputs[1];
         }
-        ExecutorScope::Current()->gc(Executor::FULL);
         {
             AUTOTIME;
-            auto outputs = modules_[layer_nums]->onForward({hidden_states});
+            auto outputs = current_modules_[layer_nums]->onForward({hidden_states});
             logits = outputs[0];
         }
     }
@@ -326,6 +379,7 @@ std::vector<int> Llm::generate(const std::vector<int>& input_ids, int max_new_to
     prompt_len_ = static_cast<int>(input_ids.size());
     if (max_new_tokens < 0) { max_new_tokens = config_->max_new_tokens(); }
     // prefill
+    current_modules_ = prefill_modules_;
     auto logits = forward(input_ids);
     if (logits.get() == nullptr) {
         return {};
@@ -334,6 +388,7 @@ std::vector<int> Llm::generate(const std::vector<int>& input_ids, int max_new_to
     output_ids.push_back(token);
     all_ids.push_back(token);
     // decode
+    current_modules_ = decode_modules_;
     while (gen_seq_len_ < max_new_tokens) {
         logits = forward({token});
         if (logits.get() == nullptr) {
@@ -348,17 +403,26 @@ std::vector<int> Llm::generate(const std::vector<int>& input_ids, int max_new_to
 }
 
 std::string Llm::generate(const std::vector<int>& input_ids, std::ostream* os, const char* end_with) {
+    if (mTracing) {
+        // Skip real forward
+        current_modules_ = prefill_modules_;
+        forward(input_ids);
+        current_modules_ = decode_modules_;
+        forward({input_ids[0]});
+        forward({input_ids[0]});
+        return "Test";
+    }
     prompt_len_ = static_cast<int>(input_ids.size());
     history_ids_.insert(history_ids_.end(), input_ids.begin(), input_ids.end()); // push to history_ids_
     auto st = std::chrono::system_clock::now();
-    modules_ = prefill_modules_;
+    current_modules_ = prefill_modules_;
     auto logits = forward(input_ids);
     if (nullptr == logits.get()) {
         return "";
     }
     int token = sample(logits, history_ids_);
     auto et = std::chrono::system_clock::now();
-    modules_ = decode_modules_;
+    current_modules_ = decode_modules_;
     std::string output_str = decode(token);
     prefill_us_ = std::chrono::duration_cast<std::chrono::microseconds>(et - st).count();
     *os << output_str << std::flush;
@@ -383,6 +447,7 @@ std::string Llm::generate(const std::vector<int>& input_ids, std::ostream* os, c
         *os << word << std::flush;
         output_str += word;
     }
+    ExecutorScope::Current()->gc(Executor::FULL);
 #ifdef DUMP_PROFILE_INFO
     print_speed();
 #endif
@@ -414,9 +479,9 @@ std::string Llm::response(const std::vector<PromptItem>& chat_prompts, std::ostr
     if (config_->reuse_kv() && all_seq_len_ > 0) {
         prompt = "<|im_end|>\n" + prompt;
     }
-    std::cout << "# prompt : " << prompt << std::endl;
+    // std::cout << "# prompt : " << prompt << std::endl;
     auto input_ids = tokenizer_->encode(prompt);
-    printf("input_ids (%lu): ", input_ids.size()); for (auto id : input_ids) printf("%d, ", id); printf("\n");
+    // printf("input_ids (%lu): ", input_ids.size()); for (auto id : input_ids) printf("%d, ", id); printf("\n");
     return generate(input_ids, os, end_with);
 }
 
@@ -443,6 +508,7 @@ Llm::~Llm() {
         MNN_PRINT("OP Summer: %.7f, Flops: %.7f, Speed: %.7f GFlops\n", opSummer, opFlopsSummber, opFlopsSummber/opSummer);
     }
 #endif
+    current_modules_.clear();
     decode_modules_.clear();
     prefill_modules_.clear();
     modules_.clear();
diff --git a/transformers/llm/engine/src/llmconfig.hpp b/transformers/llm/engine/src/llmconfig.hpp
index 71ef7291f..b09ab6177 100644
--- a/transformers/llm/engine/src/llmconfig.hpp
+++ b/transformers/llm/engine/src/llmconfig.hpp
@@ -227,10 +227,6 @@ class LlmConfig {
     bool reuse_kv() const {
         return config_.value("reuse_kv", false);
     }
-
-    int quant_kv() const {
-        return config_.value("quant_kv", 0);
-    }
     // generate config end >
 
     // < backend config start
@@ -249,6 +245,14 @@ class LlmConfig {
     std::string memory() const {
         return config_.value("memory", "low");
     }
+
+    int quant_kv() const {
+        return config_.value("quant_kv", 0);
+    }
+
+    int kvcache_limit() const {
+        return config_.value("kvcache_limit", -1);
+    }
     // backend config end >
 
     // < llm model config start
diff --git a/transformers/llm/engine/src/tokenizer.cpp b/transformers/llm/engine/src/tokenizer.cpp
index 6330d8885..87f02c868 100644
--- a/transformers/llm/engine/src/tokenizer.cpp
+++ b/transformers/llm/engine/src/tokenizer.cpp
@@ -15,6 +15,7 @@
 #include <regex>
 #include <set>
 #include <climits>
+#include <cctype>
 namespace MNN {
 namespace Transformer {
 
@@ -75,7 +76,7 @@ static std::string base64_decode(const std::string& str) {
 static inline void to_lower_case(std::string& str) {
     for (auto &c : str) {
         if (c >= 'A' && c <= 'Z') {
-            c = std::tolower(static_cast<unsigned char>(c));
+            c = tolower(static_cast<unsigned char>(c));
         }
     }
 }
@@ -540,19 +541,19 @@ void BertTokenizer::encode(const std::string& str, std::vector<int>& ids) {
             }
         }
         // handle continuous sequence of letters and digits
-        else if (std::isalnum(c)) {
-            while (i < str.size() && std::isalnum(static_cast<unsigned char>(str[i]))) {
-                current_token += std::tolower(str[i]);
+        else if (isalnum(c)) {
+            while (i < str.size() && isalnum(static_cast<unsigned char>(str[i]))) {
+                current_token += tolower(str[i]);
                 ++i;
             }
         }
         // handle punctuation and symbols
-        else if (std::ispunct(c)) {
+        else if (ispunct(c)) {
             current_token = str[i];
             ++i;
         }
         // handle space, tab, enter
-        else if (std::isspace(c)) {
+        else if (isspace(c)) {
             ++i;
             continue;
         }