From ae6253fb46e607b733ce5e8eb50b54f83459eaa6 Mon Sep 17 00:00:00 2001 From: xiaying Date: Sat, 24 Aug 2024 15:46:21 +0800 Subject: [PATCH] MNN:Sync: Sync Internal 2.9.4 --- 3rd_party/OpenCLHeaders/CL/cl2.hpp | 4 +- CMakeLists.txt | 21 +- docs/index.rst | 1 + docs/inference/module.md | 37 + docs/tools/script.md | 70 + docs/transformers/diffusion.md | 24 +- docs/transformers/llm.md | 71 +- express/Executor.cpp | 18 +- express/module/Module.cpp | 12 +- express/module/PipelineModule.cpp | 195 ++- express/module/PipelineModule.hpp | 2 + express/module/StaticModule.cpp | 330 +++-- express/module/StaticModule.hpp | 6 + include/MNN/ErrorCode.hpp | 10 + include/MNN/Interpreter.hpp | 20 + include/MNN/MNNDefine.h | 2 +- include/MNN/expr/Executor.hpp | 7 + include/MNN/expr/Module.hpp | 10 +- project/ios/MNN.xcodeproj/project.pbxproj | 44 +- project/ios/Playground/AppDelegate.mm | 52 +- .../mobilenet_finetune/mobilenet_transfer.py | 1 - .../imagenet_dataset.py | 97 -- .../quant_aware_training.py | 125 -- pymnn/pip_package/build_deps.py | 2 +- pymnn/pip_package/pyproject.toml | 5 +- pymnn/test/model_test.py | 13 +- pymnn/test/unit_test.py | 1 + pymnn/update_mnn_wrapper_assets.sh | 2 +- source/backend/arm82/Arm82Functions.cpp | 5 + .../arm64/low_memory/MNNCountMinMax_ARM82.S | 2 +- .../low_memory/MNNPackedMatMulFP16_int4.S | 12 +- .../low_memory/MNNPackedMatMulFP16_int8.S | 12 +- .../MNNPackedMatMulRemainFP16_int4.S | 36 +- .../MNNPackedMatMulRemainFP16_int8.S | 36 +- .../backend/coreml/backend/CoreMLExecutor.mm | 40 +- .../coreml/execution/CoreMLConvolution.cpp | 4 +- source/backend/cpu/CPUAttention.cpp | 491 +------ source/backend/cpu/CPUAttention.hpp | 32 +- source/backend/cpu/CPUBackend.cpp | 10 +- source/backend/cpu/CPUBackend.hpp | 2 +- source/backend/cpu/CPUConvolution.cpp | 105 +- source/backend/cpu/CPUConvolution.hpp | 19 +- .../backend/cpu/CPUConvolutionDepthwise.cpp | 2 +- source/backend/cpu/CPUDeconvolution.cpp | 12 +- source/backend/cpu/CPUDeconvolution.hpp | 15 +- .../backend/cpu/CPUDeconvolutionDepthwise.cpp | 2 +- source/backend/cpu/CPUDepthwiseConvInt8.cpp | 2 +- source/backend/cpu/KVCacheManager.cpp | 467 ++++++ source/backend/cpu/KVCacheManager.hpp | 129 ++ .../MNNGemmInt8AddBiasScale_16x4_w4_Unit.S | 21 +- source/backend/cpu/arm/arm64/MNNAvgPoolInt8.S | 16 +- .../backend/cpu/arm/arm64/MNNBilinearLineC8.S | 3 + .../cpu/arm/arm64/MNNBilinearSampleC8.S | 6 +- .../MNNGemmInt8AddBiasScale_ARMV86_Unit.S | 1 + .../backend/cpu/compute/CommonOptFunction.cpp | 84 +- .../backend/cpu/compute/CommonOptFunction.h | 11 + .../cpu/compute/ConvInt8TiledExecutor.cpp | 376 +++-- .../cpu/compute/ConvInt8TiledExecutor.hpp | 12 +- .../backend/cpu/compute/ConvInt8Winograd.cpp | 135 +- .../cpu/compute/ConvolutionFloatFactory.cpp | 16 +- .../cpu/compute/DeconvolutionWithStride.cpp | 4 +- .../compute/DenseConvolutionTiledExecutor.cpp | 97 +- .../backend/cpu/compute/GemmInt8Executor.cpp | 25 +- .../backend/cpu/compute/GemmInt8Executor.hpp | 2 +- .../cpu/compute/IdstConvolutionInt8.cpp | 4 +- .../compute/SparseConvInt8TiledExecutor.cpp | 23 +- .../compute/SparseConvInt8TiledExecutor.hpp | 4 +- .../cuda/execution/ConvCutlassExecution.cu | 10 +- .../cuda/execution/ConvDepthWiseExecution.cu | 42 +- .../cuda/execution/ConvImplicitExecution.cu | 18 +- .../cuda/execution/ConvWinogradExecution.cu | 58 +- .../execution/DeconvSingleInputExecution.cu | 20 +- .../bf16/ConvCutlassBf16Execution.cu | 2 +- .../weight_only_quant/ConvFpAIntBExecution.cu | 18 +- .../backend/hiai/execution/NPUConvolution.cpp | 2 +- .../execution/NPUConvolutionDepthwise.cpp | 4 +- source/backend/metal/MetalConvolution.mm | 14 +- source/backend/metal/MetalConvolution1x1.mm | 22 +- .../backend/metal/MetalConvolutionCommon.hpp | 2 +- .../backend/metal/MetalConvolutionCommon.mm | 86 +- .../metal/MetalConvolutionDepthwise.mm | 8 +- .../backend/metal/MetalConvolutionWinograd.mm | 4 +- source/backend/metal/MetalDeconvolution.mm | 6 +- .../nnapi/execution/NNAPIConvolution.cpp | 2 +- source/backend/opencl/CMakeLists.txt | 4 +- source/backend/opencl/core/OpenCLBackend.cpp | 24 +- source/backend/opencl/core/OpenCLGemmTune.cpp | 21 +- .../opencl/core/OpenCLRunningUtils.cpp | 16 + .../opencl/core/OpenCLRunningUtils.hpp | 4 + .../opencl/core/runtime/OpenCLRuntime.cpp | 26 + .../opencl/core/runtime/OpenCLRuntime.hpp | 7 + .../opencl/core/runtime/OpenCLWrapper.cpp | 37 - .../opencl/core/runtime/OpenCLWrapper.hpp | 9 +- .../execution/buffer/CastBufExecution.cpp | 1 + .../execution/buffer/ConvBufExecution.cpp | 232 ++- .../execution/buffer/ConvBufExecution.hpp | 6 + .../buffer/ConvBufLowMemoryExecution.cpp | 124 +- .../buffer/ConvBufLowMemoryExecution.hpp | 5 +- .../execution/buffer/ConvBufWinograd.cpp | 2 +- .../buffer/ConvSubgroupBufExecution.cpp | 24 +- .../execution/buffer/DeconvBufExecution.cpp | 2 +- .../buffer/DepthwiseConvBufExecution.cpp | 2 +- .../DepthwiseConvSubgroupBufExecution.cpp | 10 +- .../buffer/GroupNormBufExecution.cpp | 3 + .../execution/buffer/MatmulBufExecution.cpp | 46 +- .../buffer/StrassenMatmulOpenCLComputor.cpp | 470 ++++++ .../buffer/StrassenMatmulOpenCLComputor.hpp | 67 + .../execution/cl/buffer_convert_quant.cl | 229 +-- .../execution/cl/gemm_quant_batch_buf.cl | 293 ++-- .../opencl/execution/cl/gemv_conv1x1_buf.cl | 451 +----- .../opencl/execution/cl/groupnorm_buf.cl | 14 +- .../opencl/execution/cl/matmul_params_buf.cl | 224 +-- .../opencl/execution/cl/opencl_program.cc | 1269 +++++++---------- .../opencl/execution/cl/opencl_source_map.hpp | 6 + .../execution/cl/strassen_binary_buf.cl | 101 ++ .../opencl/execution/image/ConvExecution.cpp | 2 +- .../image/ConvLowMemoryExecution.cpp | 128 +- .../image/ConvLowMemoryExecution.hpp | 2 + .../opencl/execution/image/ConvWinograd.cpp | 2 +- .../execution/image/DeconvExecution.cpp | 2 +- .../image/DepthwiseConvExecution.cpp | 2 +- .../image/DepthwiseDeconvExecution.cpp | 2 +- source/backend/opencl/schema/CLCache.fbs | 6 + .../opencl/schema/current/CLCache_generated.h | 144 +- .../tensorrt/execution/TRTConvolution.cpp | 2 +- .../tensorrt/execution/TRTDeconvolution.cpp | 4 +- .../execution/TRTDepthwiseConvolution.cpp | 4 +- .../execution/TRTDepthwiseDeconvolution.cpp | 6 +- source/backend/vulkan/CMakeLists.txt | 2 +- .../vulkan/buffer/backend/VulkanBackend.cpp | 9 +- .../buffer/execution/VulkanConvolution.cpp | 6 +- .../buffer/execution/VulkanDeconvolution.cpp | 9 +- .../buffer/execution/VulkanDeconvolution.hpp | 2 +- .../vulkan/buffer/execution/VulkanUnary.cpp | 2 + .../backend/vulkan/component/VulkanDevice.cpp | 29 +- .../backend/vulkan/component/VulkanDevice.hpp | 3 +- .../vulkan/component/VulkanInstance.cpp | 37 +- .../vulkan/component/VulkanInstance.hpp | 5 - .../vulkan/component/VulkanPipeline.cpp | 3 +- .../vulkan/component/VulkanQueryPool.cpp | 2 +- .../vulkan/image/backend/VulkanBackend.cpp | 5 +- .../vulkan/image/compiler/AllShader.cpp | 1131 +++++++-------- .../vulkan/image/compiler/VulkanShaderMap.cpp | 2 + .../vulkan/image/compiler/makeshader.py | 3 +- .../vulkan/image/execution/VulkanArgMax.cpp | 129 ++ .../vulkan/image/execution/VulkanArgMax.hpp | 40 + .../vulkan/image/execution/VulkanBinary.cpp | 3 + .../image/execution/VulkanConvolution.cpp | 2 +- .../image/execution/VulkanDeconvolution.cpp | 9 +- .../image/execution/VulkanDeconvolution.hpp | 2 +- .../VulkanDeconvolutionDepthwise.cpp | 7 +- .../VulkanDeconvolutionDepthwise.hpp | 2 +- .../vulkan/image/execution/VulkanRaster.cpp | 3 + .../vulkan/image/execution/VulkanUnary.cpp | 2 + .../vulkan/image/execution/glsl/argmax.comp | 51 + .../vulkan/image/execution/glsl/avgpool.comp | 2 +- .../image/execution/glsl/binaryImage.comp | 2 +- .../image/execution/glsl/blit_image.comp | 2 +- .../execution/glsl/convolutionDepthwise.comp | 12 +- .../glsl/convolutionDepthwiseMali.comp | 12 +- .../image/execution/glsl/deconvCol2Im.comp | 4 +- .../image/execution/glsl/deconvIm2Col.comp | 6 +- .../glsl/deconvolutionDepthwise.comp | 8 +- .../image/execution/glsl/fill_image.comp | 2 +- .../image/execution/glsl/gemm16x16.comp | 8 +- .../execution/glsl/gridSampleBilinear.comp | 6 +- .../execution/glsl/gridSampleNearest.comp | 6 +- .../vulkan/image/execution/glsl/im2col.comp | 4 +- .../image/execution/glsl/im2col1x1.comp | 4 +- .../image/execution/glsl/imageTonc4hw4.comp | 2 +- .../image/execution/glsl/imageTonchw.comp | 2 +- .../vulkan/image/execution/glsl/macro.json | 3 + .../image/execution/glsl/matmul_input.comp | 2 +- .../image/execution/glsl/matmul_output.comp | 2 +- .../vulkan/image/execution/glsl/maxpool.comp | 2 +- .../image/execution/glsl/nc4hw4toimage.comp | 2 +- .../image/execution/glsl/nchwToimage.comp | 2 +- .../image/execution/glsl/packAsImage4x4.comp | 2 +- .../execution/glsl/preluWithChannel.comp | 6 +- .../vulkan/image/execution/glsl/relu.comp | 4 +- .../vulkan/image/execution/glsl/relu6.comp | 4 +- .../image/execution/glsl/resizeBilinear.comp | 4 +- .../image/execution/glsl/resizeNearest.comp | 4 +- .../image/execution/glsl/roipooling.comp | 2 +- .../vulkan/image/execution/glsl/scale.comp | 2 +- .../image/execution/glsl/unPackImage4x4.comp | 2 +- .../image/execution/glsl/unaryImage.comp | 2 +- .../glsl/winogradTransformDest2_3_1.comp | 6 +- .../glsl/winogradTransformSource2_3_1.comp | 6 +- .../backend/vulkan/image/shaders/AllShader.h | 4 + .../backend/vulkan/runtime/VulkanRuntime.cpp | 54 +- .../backend/vulkan/runtime/VulkanRuntime.hpp | 3 +- source/backend/vulkan/vulkan/vulkan_core.h | 9 +- source/core/Backend.cpp | 16 - source/core/Backend.hpp | 11 +- source/core/BufferAllocator.cpp | 4 +- source/core/ConvolutionCommon.cpp | 91 +- source/core/ConvolutionCommon.hpp | 6 +- source/core/FileLoader.hpp | 27 +- source/core/IDSTDecoder.hpp | 84 +- source/core/Interpreter.cpp | 11 +- source/core/MNNFileUtils.cpp | 284 ++++ source/core/MNNFileUtils.h | 182 +++ source/core/OpCommonUtils.cpp | 24 +- source/core/Pipeline.cpp | 30 +- source/core/Pipeline.hpp | 1 + source/core/Schedule.cpp | 10 +- source/core/Schedule.hpp | 3 + source/core/Session.cpp | 29 +- source/core/Session.hpp | 1 + source/geometry/GeometryComputer.hpp | 1 + source/geometry/GeometryComputerUtils.cpp | 10 + .../geometry/GeometryConv2DBackPropFilter.cpp | 2 +- source/geometry/GeometryReverseSequence.cpp | 4 +- source/shape/SizeComputer.cpp | 2 +- source/utils/InitNet.cpp | 90 +- source/utils/InitNet.hpp | 2 +- test.sh | 33 +- test/MNNTestSuite.cpp | 1 + test/core/FileUtilsTest.cpp | 320 +++++ test/core/IDSTTest.cpp | 12 +- test/expr/MemoryIncrease.cpp | 75 + test/expr/ModuleShapeInfer.cpp | 108 ++ test/expr/ReverseSequenceTest.cpp | 30 +- test/grad/BinaryGradTest.cpp | 3 + test/grad/GridSampleGradTest.cpp | 3 +- test/grad/PReLUGradTest.cpp | 3 +- test/op/ConvInt8Test.cpp | 1 + test/op/ResizeTest.cpp | 6 +- test/op/ReverseTest.cpp | 22 + .../optimizer/merge/ConvertMatMulToConv2D.cpp | 20 +- .../source/optimizer/onnxextra/OnnxClip.cpp | 13 +- .../onnxextra/OnnxDeQuantizeLinear.cpp | 6 +- .../source/optimizer/onnxextra/OnnxEinsum.cpp | 8 +- .../onnxextra/OnnxQuantizeLinear.cpp | 4 +- .../tflitextra/ConvTranposeTflite.cpp | 52 + .../source/tflite/ConvolutionTflite.cpp | 16 +- .../converter/source/tflite/CustomTflite.cpp | 74 + tools/cpp/ExprDebug.hpp | 6 +- tools/cpp/LoRA.cpp | 3 +- tools/quantization/calibration.cpp | 30 +- tools/script/apply_gptq.py | 37 +- tools/script/apply_lora.py | 156 ++ tools/script/arm_assembly.py | 30 +- tools/script/convertOnnxTest.py | 1 + tools/script/convertTfTest.py | 1 + tools/script/convertTfliteTest.py | 1 + tools/script/convertTorchTest.py | 1 + tools/script/modelTest.py | 1 + tools/script/testPTQ.py | 12 +- tools/train/register.py | 44 + tools/train/source/demo/MobilenetV2Utils.cpp | 32 +- tools/train/source/demo/MobilenetV2Utils.hpp | 4 +- tools/train/source/demo/demoMain.cpp | 2 +- tools/train/source/demo/mobilenetV2Train.cpp | 83 +- tools/train/source/grad/BinaryGrad.cpp | 10 +- tools/train/source/grad/BroadcastToGrad.cpp | 10 +- tools/train/source/grad/ConcatGrad.cpp | 13 +- tools/train/source/grad/ConvGrad.cpp | 16 +- tools/train/source/grad/GatherGrad.cpp | 12 +- tools/train/source/grad/GradOPRegister.cpp | 65 + tools/train/source/grad/GridSampleGrad.cpp | 12 +- tools/train/source/grad/InterpGrad.cpp | 12 +- tools/train/source/grad/LoopGrad.cpp | 12 +- tools/train/source/grad/MatMulGrad.cpp | 12 +- .../train/source/grad/MatrixBandPartGrad.cpp | 12 +- tools/train/source/grad/OpGrad.cpp | 9 + tools/train/source/grad/OpGrad.hpp | 6 + tools/train/source/grad/PermuteGrad.cpp | 12 +- tools/train/source/grad/PoolGrad.cpp | 12 +- tools/train/source/grad/RasterGrad.cpp | 12 +- tools/train/source/grad/ReduceGrad.cpp | 12 +- tools/train/source/grad/ReluGrad.cpp | 12 +- tools/train/source/grad/RenderGrad.cpp | 12 +- tools/train/source/grad/ReshapeGrad.cpp | 12 +- tools/train/source/grad/RoiAlignGrad.cpp | 12 +- tools/train/source/grad/RoiPoolGrad.cpp | 12 +- tools/train/source/grad/ScaleGrad.cpp | 12 +- tools/train/source/grad/SelectGrad.cpp | 12 +- tools/train/source/grad/SeluGrad.cpp | 12 +- tools/train/source/grad/SliceGrad.cpp | 12 +- tools/train/source/grad/SoftmaxGrad.cpp | 12 +- tools/train/source/grad/StridedSliceGrad.cpp | 12 +- tools/train/source/grad/TensorConvertGrad.cpp | 12 +- tools/train/source/grad/TopKV2Grad.cpp | 12 +- tools/train/source/grad/UnaryGrad.cpp | 12 +- tools/train/source/grad/ZeroGrad.cpp | 12 +- tools/train/source/models/MobilenetV2.cpp | 45 +- tools/train/source/models/MobilenetV2.hpp | 2 +- transformers/diffusion/main.cpp | 32 +- transformers/diffusion/pipeline.cpp | 80 +- transformers/diffusion/pipeline.hpp | 17 +- transformers/llm/config.json | 10 +- transformers/llm/engine/CMakeLists.txt | 30 +- transformers/llm/engine/include/llm/llm.hpp | 11 +- transformers/llm/engine/llm_demo.cpp | 20 +- transformers/llm/engine/src/llm.cpp | 88 +- transformers/llm/engine/src/llmconfig.hpp | 12 +- transformers/llm/engine/src/tokenizer.cpp | 13 +- 299 files changed, 7931 insertions(+), 4556 deletions(-) create mode 100644 docs/tools/script.md delete mode 100644 pymnn/examples/MNNTrain/quantization_aware_training/imagenet_dataset.py delete mode 100644 pymnn/examples/MNNTrain/quantization_aware_training/quant_aware_training.py create mode 100644 source/backend/cpu/KVCacheManager.cpp create mode 100644 source/backend/cpu/KVCacheManager.hpp create mode 100644 source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.cpp create mode 100644 source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.hpp create mode 100644 source/backend/opencl/execution/cl/strassen_binary_buf.cl create mode 100644 source/backend/vulkan/image/execution/VulkanArgMax.cpp create mode 100644 source/backend/vulkan/image/execution/VulkanArgMax.hpp create mode 100644 source/backend/vulkan/image/execution/glsl/argmax.comp create mode 100644 source/core/MNNFileUtils.cpp create mode 100644 source/core/MNNFileUtils.h create mode 100644 test/core/FileUtilsTest.cpp create mode 100644 test/expr/ModuleShapeInfer.cpp create mode 100644 tools/converter/source/optimizer/tflitextra/ConvTranposeTflite.cpp create mode 100644 tools/script/apply_lora.py create mode 100644 tools/train/register.py create mode 100644 tools/train/source/grad/GradOPRegister.cpp diff --git a/3rd_party/OpenCLHeaders/CL/cl2.hpp b/3rd_party/OpenCLHeaders/CL/cl2.hpp index 491285264..4db4f7cf6 100644 --- a/3rd_party/OpenCLHeaders/CL/cl2.hpp +++ b/3rd_party/OpenCLHeaders/CL/cl2.hpp @@ -805,9 +805,9 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL) #define __GET_GL_OBJECT_INFO_ERR CL_HPP_ERR_STR_(clGetGLObjectInfo) #if CL_HPP_TARGET_OPENCL_VERSION >= 120 #define __CREATE_IMAGE_ERR CL_HPP_ERR_STR_(clCreateImage) -#define __CREATE_GL_TEXTURE_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture) #define __IMAGE_DIMENSION_ERR CL_HPP_ERR_STR_(Incorrect image dimensions) #endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +#define __CREATE_GL_TEXTURE_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture) #define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR CL_HPP_ERR_STR_(clSetMemObjectDestructorCallback) #define __CREATE_USER_EVENT_ERR CL_HPP_ERR_STR_(clCreateUserEvent) @@ -5229,7 +5229,6 @@ class Image3DGL : public Image3D }; #endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS -#if CL_HPP_TARGET_OPENCL_VERSION >= 120 /*! \class ImageGL * \brief general image interface for GL interop. * We abstract the 2D and 3D GL images into a single instance here @@ -5308,7 +5307,6 @@ class ImageGL : public Image return *this; } }; -#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 diff --git a/CMakeLists.txt b/CMakeLists.txt index 006ae131f..f117f9d1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -673,6 +673,15 @@ IF(MNN_TENSORRT) list(APPEND MNN_EXTRA_DEPENDS ${MNN_TRT_LIBS}) ENDIF() +IF(MNN_BUILD_LLM) + # add_definitions(-DMNN_BUILD_LLM) + include(${CMAKE_CURRENT_LIST_DIR}/transformers/llm/engine/CMakeLists.txt) + IF(NOT MNN_SEP_BUILD) + list(APPEND MNN_TARGETS llm) + list(APPEND MNN_OBJECTS_TO_LINK $) + ENDIF() +ENDIF() + IF(MNN_SEP_BUILD) add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS} ${MNN_EXTRA_HEADERS}) target_link_libraries(MNN PUBLIC ${MNN_EXTRA_DEPENDS}) @@ -744,13 +753,7 @@ IF(MNN_BUILD_OPENCV AND NOT MNN_SEP_BUILD) ENDIF() target_sources(MNN PRIVATE $) ENDIF() -IF(MNN_BUILD_LLM) - # add_definitions(-DMNN_BUILD_LLM) - include(${CMAKE_CURRENT_LIST_DIR}/transformers/llm/engine/CMakeLists.txt) - IF(NOT MNN_SEP_BUILD) - target_sources(MNN PRIVATE $) - ENDIF() -ENDIF() + if(CMAKE_SYSTEM_NAME MATCHES "^Linux") # Using -pthread, needed by thread-safe implemention of glibc, is better than only using -lpthread @@ -761,9 +764,7 @@ elseif(CMAKE_SYSTEM_NAME MATCHES "^Android") else() endif() if (NOT MNN_BUILD_SHARED_LIBS) - if(APPLE) - set(MNN_DEPS -Wl,-all_load ${MNN_DEPS} -Wl,-noall_load) - elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") # Static-link will not replace thread-related weak symbol in glibc with strong symbol # in pthread library, so we need use --whole-archive to pthread # https://stackoverflow.com/questions/35116327/when-g-static-link-pthread-cause-segmentation-fault-why diff --git a/docs/index.rst b/docs/index.rst index 8c97f2410..827a85235 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -82,6 +82,7 @@ tools/compress tools/visual tools/python + tools/script .. toctree:: :maxdepth: 1 diff --git a/docs/inference/module.md b/docs/inference/module.md index 22347a576..7ec90a8a4 100644 --- a/docs/inference/module.md +++ b/docs/inference/module.md @@ -200,6 +200,9 @@ MNN::Express::ExecutorScope scope(executor); module_thread.reset(); ``` +## 多线程 +Module 的创建与运行依赖其所绑定的 Executor ,若不指定,则为全局 Executor ,并非线程安全。在多线程创建 Module 或进行推理时,会竞争全局 Executor 的资源,需要上锁或绑定不同的 Executor 。 + ## 调试 Module API 也支持使用回调函数进行调试,与[runSessionWithCallBack](session.html#id19)相似。示例代码: @@ -232,6 +235,40 @@ Express::Executor::getGlobalExecutor()->setCallBack(std::move(beforeCallBack), s std::vector outputs = user_module->onForward(inputs); ``` +## 预推理分离模式 +对于满足 Interpreter-Session 运行条件的模型,若用户希望分离预推理(形状计算,几何计算,资源申请,策略搜索)与推理(内容计算)过程,可以设置预推理分离模式。示例代码如下: + +```cpp +std::shared_ptr net(Module::load({"x"}, {"y"}, (const uint8_t*)buffer.data(), buffer.size()), Module::destroy); +// 预推理分离模式 +auto code = net->traceOrOptimize(Interpreter::Module_Forward_Seperate); +if (0 != code) { + // 若模型不支持预推理分离,需要还原设定 + net->traceOrOptimize(Interpreter::Module_Forward_Combine); +} + +/*预推理开始*/ +x = _Input({1, 3, 2, 2}, NCHW, halide_type_of()); +auto input = x->writeMap(); +y = net->onForward({x})[0]; +auto output = y->readMap(); + +/*预推理结束,获取输入和输出的数据指针*/ + +/*内容计算*/ +/* +Fill input +*/ + +// 输入空数组,表示仅进行推理 +net1->onForward({}); + +/* +Use output +*/ + +``` + ## 示例代码 完整的示例代码可以参考`demo/exec/`文件夹中的以下源码文件: - `pictureRecognition_module.cpp` 使用`Module`执行图像分类,使用`ImageProcess`进行前处理,`Expr`进行后处理 diff --git a/docs/tools/script.md b/docs/tools/script.md new file mode 100644 index 000000000..551b8e7b1 --- /dev/null +++ b/docs/tools/script.md @@ -0,0 +1,70 @@ +# 脚本工具 +一些功能性脚本,提供各种功能。 + +## apply_gptq.py +将GPTQ的权重写入到量化的MNN权重中。 + +### 用法 +``` +usage: apply_gptq.py [-h] --mnn_graph MNN_GRAPH --mnn_weight MNN_WEIGHT --gptq_tensor GPTQ_TENSOR + +apply_gptq + +options: + -h, --help show this help message and exit + --mnn_graph MNN_GRAPH + mnn graph json path. + --mnn_weight MNN_WEIGHT + mnn weight file path. + --gptq_tensor GPTQ_TENSOR + gptq tensor path. +``` + +### 参数 +- MNN_GRAPH: 模型计算图的json文件,获取方法:`./MNNDump2Json model.mnn model.json` +- MNN_WEIGHT: 模型的权重文件,如:`gptq.mnn.weight` +- GPTQ_TENSOR: GPTQ量化后的权重文件,`model.safetensor` + +### 示例 +使用该脚本生成gptq量化的权重`gptq.mnn.weight` +```sh +cd build +./MNNDump2Json model.mnn model.json +cp model.mnn.weight gptq.mnn.weight +python ../tools/script/apply_gptq.py --mnn_graph model.json --mnn_weight gptq.mnn.weight --gptq_tensor model.safetensor +``` + +## apply_lora.py + +合并base模型的计算图和lora模型的权重文件,生成新的计算图。 + +### 用法 +```sh +usage: apply_lora.py [-h] --base BASE --lora LORA [--scale SCALE] [--fuse FUSE] [--out OUT] + +apply_lora + +options: + -h, --help show this help message and exit + --base BASE base model json path. + --lora LORA lora dir path or *.safetensors path. + --scale SCALE lora scale: `alpha/r`. + --fuse FUSE fuse A and B. + --out OUT out file name. +``` + +### 参数 +- BASE: base.json, base模型计算图的json文件,获取方法:`./MNNDump2Json base.mnn base.json` +- LORA: lora权重文件夹或者lora权重的safetensors +- SCALE: lora权重的scale, `lora_alpha / lora_r`, 一般为4.0 +- FUSE: 是否将lora_A与lora_B合并成为一个lora权重,合并后模型较大 +- OUT: 生成新的计算图文件名,默认为`lora.json`,转换为模型:`./MNNRevert2Buffer lora.json lora.mnn` + +### 示例 +使用该脚本生成lora对应的模型`lora.mnn`, 用法: [LoRA](../transformers/llm.html#lora) +```sh +cd build +./MNNDump2Json base.mnn base.json +python ../tools/script/apply_lora.py --base base.json --lora lora_dir +./MNNRevert2Buffer lora.json lora.mnn +``` \ No newline at end of file diff --git a/docs/transformers/diffusion.md b/docs/transformers/diffusion.md index 32d790a26..70e64766b 100644 --- a/docs/transformers/diffusion.md +++ b/docs/transformers/diffusion.md @@ -35,9 +35,10 @@ conda activate ldm ``` ./MNNConvert -f ONNX --modelFile onnx_save_path/text_encoder/model.onnx --MNNModel mnn_save_path/text_encoder.mnn --weightQuantBits 8 --bizCode biz ``` -2. 实现denoiser从onnx模型 -> mnn模型 +2. 实现denoiser unet从onnx模型 -> mnn模型 ``` ./MNNConvert -f ONNX --modelFile onnx_save_path/unet/model.onnx --MNNModel mnn_save_path/unet.mnn --transformerFuse --weightQuantBits 8 --bizCode biz +注意:对于非OpenCL后端推理,需要去掉--transformerFuse。 ``` 3. 实现decoder从onnx模型 -> mnn模型 ``` @@ -60,19 +61,26 @@ cd mnn_path/project/android/build ``` ## 运行Diffusion Demo ``` -./diffusion_demo +./diffusion_demo ``` 其中,resource_path 就是mnn模型文件的路径,除了mnn文件,还需要: 1. 将MNN目录transformers/diffusion/scheduler/alphas.txt文件拷贝到该文件夹下。 -2. 针对stable-diffusion-v1-5模型需要将huggingfacetokenizer目录下merges.txt和vocab.json拷贝到该文件夹中。 +2. 针对stable-diffusion-v1-5/chilloutmix模型需要将huggingfacetokenizer目录下merges.txt和vocab.json拷贝到该文件夹中。 3. 针对Taiyi-Stable-Diffusion模型需要将huggingfacetokenizer目录下vocab.txt拷贝到该文件夹中。 -4. model_type是目前支持的两种diffusion模型的类别。如果是stable-diffusion-v1-5模型设为0,如果是Taiyi-Stable-Diffusion模型设为1。 +4. model_type是目前支持的两种diffusion模型的类别。如果是stable-diffusion-v1-5/chilloutmix模型设为0,如果是Taiyi-Stable-Diffusion模型设为1。 5. output_image_name是生成图片的名字,默认图片位置在当前运行目录下。 -6. input_text是文生图的prompt,如果是stable-diffusion-v1-5模型建议英文prompt,如果是Taiyi-Stable-Diffusion建议中文prompt。 +6. memory_mode代表设备是否内存足够,设为0表示内存节约模式(demo中每个模型使用前等待初始化,用完释放),1代表内存足够模式(所有模式启动时全初始化完,用时无需等待初始化)。 +7. backend_type代表选择的运行后端。 +8. input_text是文生图的prompt,如果是stable-diffusion-v1-5/chilloutmix模型建议英文prompt,如果是Taiyi-Stable-Diffusion建议中文prompt。 运行指令例如: ``` -./diffusion_demo mnn_sd1.5_path 0 demo.jpg "a cute cat" -./diffusion_demo mnn_chilloutmix_path 0 demo.jpg "a pure girl" -./diffusion_demo mnn_taiyi_path 1 demo.jpg "一只可爱的猫" +./diffusion_demo mnn_sd1.5_path 0 demo.jpg 0 3 "a cute cat" +./diffusion_demo mnn_chilloutmix_path 0 demo.jpg 0 3 "a pure girl" +./diffusion_demo mnn_taiyi_path 1 demo.jpg 0 3 "一只可爱的猫" ``` +## FAQ +1. Demo运行报错、段错误,怎么解决? +- 常见错误可能是设备内存不足,通常支持opencl fp16的设备需要保证3GB以上的内存,不支持fp16则需要6GB以上显存了。 +2. 使用其他后端,出现报错,什么原因? +- 目前其他后端暂不支持transformer插件算子,需要在onnx->mnn模型转换阶段,去掉--transformerFuse。 diff --git a/docs/transformers/llm.md b/docs/transformers/llm.md index 2358548c6..5e77ab0cb 100644 --- a/docs/transformers/llm.md +++ b/docs/transformers/llm.md @@ -110,7 +110,7 @@ options: ### 编译 -[从源码编译](../compile/tools.html#id4) +[从源码编译](../compile/other.html#id4) ### 使用 #### 运行时配置 @@ -151,7 +151,7 @@ options: - 3: 使用非对称8bit量化存储key,使用fp8格式寸处value - 硬件配置 - backend_type: 推理使用硬件后端类型,默认为:`"cpu"` - - thread_num: 推理使用硬件线程数,默认为:`4` + - thread_num: CPU推理使用硬件线程数,默认为:`4`; OpenCL推理时使用`68` - precision: 推理使用精度策略,默认为:`"low"`,尽量使用`fp16` - memory: 推理使用内存策略,默认为:`"low"`,开启运行时量化 @@ -201,4 +201,69 @@ options: ./llm_demo model_dir/llm.mnn ## 针对prompt中的每行进行回复 ./llm_demo model_dir/llm.mnn prompt.txt -``` \ No newline at end of file +``` + +#### GPTQ权重加载 +- 使用脚本生成GPTQ模型权重,用法参考: [apply_gptq.py](../tools/script.html#apply-gptq-py) +- 创建`gptq.json`配置文件 + ```json + { + "llm_model": "model.mnn", + "llm_weight": "gptq.mnn.weight", + } + ``` + + +#### LoRA权重加载 +- 使用脚本生成lora模型,用法参考: [apply_lora.py](../tools/script.html#apply-lora-py) +- lora模型使用 + - 直接加载lora模型使用,创建`lora.json`配置文件 + ```json + { + "llm_model": "lora.mnn", + "llm_weight": "base.mnn.weight", + } + ``` + - 运行时选择并切换lora模型 + ```cpp + // 创建并加载base模型 + std::unique_ptr llm(Llm::createLLM(config_path)); + llm->load(); + // 使用同一个对象,在多个lora模型之间选择性使用,不可以并发使用 + { + // 在基础模型的基础上添加`lora_1`模型,模型的索引为`lora_1_idx` + size_t lora_1_idx = llm->apply_lora("lora_1.mnn"); + llm->response("Hello lora1"); // 使用`lora_1`模型推理 + // 添加`lora_2`模型,并使用 + size_t lora_2_idx = llm->apply_lora("lora_2.mnn"); + llm->response("Hello lora2"); // 使用`lora_2`模型推理 + // 通过索引选择`lora_1`作为llm对象当前使用的模型 + llm->select_module(lora_1_idx); + llm->response("Hello lora1"); // 使用`lora_1`模型推理 + // 释放加载的lora模型 + llm->release_module(lora_1_idx); + llm->release_module(lora_2_idx); + // 选择使用基础模型 + llm->select_module(0); + llm->response("Hello base"); // 使用`base`模型推理 + } + // 使用多个对象,可以并发的加载使用多个lora模型 + { + std::mutex creat_mutex; + auto chat = [&](const std::string& lora_name) { + MNN::BackendConfig bnConfig; + auto newExe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1); + ExecutorScope scope(newExe); + Llm* current_llm = nullptr; + { + std::lock_guard guard(creat_mutex); + current_llm = llm->create_lora(lora_name); + } + current_llm->response("Hello"); + }; + std::thread thread1(chat, "lora_1.mnn"); + std::thread thread2(chat, "lora_2.mnn"); + thread1.join(); + thread2.join(); + } + ``` \ No newline at end of file diff --git a/express/Executor.cpp b/express/Executor.cpp index f6b85765c..437d72df6 100644 --- a/express/Executor.cpp +++ b/express/Executor.cpp @@ -48,7 +48,7 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& std::shared_ptr bn(creator->onCreate(info)); mRuntimes[mAttr->firstType] = bn; } else { - firstIter->second->onReset(numberThread, &config); + firstIter->second->onReset(numberThread, &config, true); } } else { auto creator = MNNGetExtraRuntimeCreator(type); @@ -69,7 +69,7 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& std::shared_ptr bn(creator->onCreate(info)); mRuntimes[mAttr->firstType] = bn; } else { - firstIter->second->onReset(numberThread, &config); + firstIter->second->onReset(numberThread, &config, true); } } _refreshRuntime(); @@ -147,10 +147,6 @@ static std::shared_ptr* gExecutor = nullptr; std::shared_ptr Executor::getGlobalExecutor() { std::call_once(gInitFlag, [&]() { auto creator = MNNGetExtraRuntimeCreator(MNN_FORWARD_CPU); -#ifdef MNN_BUILD_MINI - SizeComputerSuite::init(); - GeometryComputer::init(); -#endif Backend::Info info; info.type = MNN_FORWARD_CPU; info.numThread = 1; @@ -158,7 +154,9 @@ std::shared_ptr Executor::getGlobalExecutor() { RuntimeHint hint; hint.memoryAllocatorType = 0;// Defer bn->setRuntimeHint(hint); - gExecutor = new std::shared_ptr(new Executor(bn, MNN_FORWARD_CPU, 1)); + static std::shared_ptr executorStatic; + executorStatic.reset(new Executor(bn, MNN_FORWARD_CPU, 1)); + gExecutor = &executorStatic; }); return *gExecutor; } @@ -254,6 +252,10 @@ void Executor::RuntimeManager::setMode(Interpreter::SessionMode mode) { void Executor::RuntimeManager::setHint(Interpreter::HintMode mode, int value) { mInside->modes.setHint(mode, value); } +void Executor::RuntimeManager::setExternalPath(std::string path, int type) { + mInside->modes.setExternalPath(path, type); +} + bool Executor::RuntimeManager::getInfo(Interpreter::SessionInfoCode code, void* ptr) { // Only support get memory switch (code) { @@ -320,7 +322,7 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S } originRt.insert(std::make_pair(compute.type, std::shared_ptr(newBn))); } else { - iter->second->onReset(compute.numThread, compute.user); + iter->second->onReset(compute.numThread, compute.user, false); } res->mInside->mRuntime.second = originRt[DEFAULT_BACKUP_RUNTIME_KEY]; res->mInside->mRuntime.first.insert(std::make_pair(compute.type, originRt[compute.type])); diff --git a/express/module/Module.cpp b/express/module/Module.cpp index 82172bfbd..4ba49c27a 100644 --- a/express/module/Module.cpp +++ b/express/module/Module.cpp @@ -481,10 +481,18 @@ Module* Module::extract(std::vector inputs, std::vectoronOptimize(stage); + if (code != 0) { + // Has Error + return code; + } for (auto& m : mChildren) { - m->traceOrOptimize(stage); + code = m->traceOrOptimize(stage); + if (code != 0) { + return code; + } } - return this->onOptimize(stage); + return code; } diff --git a/express/module/PipelineModule.cpp b/express/module/PipelineModule.cpp index 932ae6daa..ee9f4d496 100644 --- a/express/module/PipelineModule.cpp +++ b/express/module/PipelineModule.cpp @@ -17,6 +17,7 @@ #include "core/Backend.hpp" #include "core/WrapExecution.hpp" #include "core/FileLoader.hpp" +#include "core/OpCommonUtils.hpp" #include "utils/InitNet.hpp" #include "RuntimeAttr.hpp" #include "geometry/GeometryComputer.hpp" @@ -197,8 +198,27 @@ std::vector PipelineModule::countOutputReference(std::vector outputInd } return countResult; } +int PipelineModule::onOptimize(Interpreter::SessionMode stage) { + if (stage == Interpreter::Module_Forward_Separate) { + if (mSubModules.size() == 1 && std::get<0>(mSubModules[0])->type() == "StaticModule") { + mSeperate = true; + return 0; + } + return NOT_SUPPORT; + } else if (stage == Interpreter::Module_Forward_Combine) { + mSeperate = false; + } + return 0; +} std::vector PipelineModule::onForward(const std::vector& inputs) { + if (mSeperate && inputs.empty()) { + for (int index = 0; index < mSubModules.size(); ++index) { + auto& m = mSubModules[index]; + std::get<0>(m)->onForward(inputs); + } + return {}; + } std::vector mStack(mStackSize); for (int i = 0; i < mInitVars.size(); ++i) { mStack[i + mInputSize] = mInitVars[i]; @@ -386,6 +406,93 @@ static std::vector _collectNeededOps(const MNN::Net* net, const std::set _findBreakIndex(const SubModuleInfo& info, const Net* net, std::shared_ptr sharedConst) { + // 0: not used, 1: const, 2: output + std::vector constMask(sharedConst->allTensors.size(), 0); + for (int i=0; iallTensors.size(); ++i) { + if(sharedConst->allTensors[i].get() != nullptr) { + constMask[i] = 1; + } + } + for (int v = 0; v < info.opList.size(); ++v) { + auto op = net->oplists()->GetAs(info.opList[v]); + if (nullptr == op->outputIndexes()) { + continue; + } + bool isConst = true; + if (nullptr != op->inputIndexes()) { + for (int i=0; iinputIndexes()->size(); ++i) { + auto index = op->inputIndexes()->data()[i]; + if (constMask[index]) { + continue; + } + if (OpCommonUtils::opNeedContent(op, i)) { + isConst = false; + break; + } + } + } + if (isConst) { + for (int i=0; ioutputIndexes()->size(); ++i) { + auto index = op->outputIndexes()->data()[i]; + constMask[index] = 1; + } + } + } + std::vector res; + // Check Break Index + for (int v = 0; v < info.opList.size(); ++v) { + auto op = net->oplists()->GetAs(info.opList[v]); + if (nullptr == op->outputIndexes() || nullptr == op->inputIndexes()) { + continue; + } + int inputNum = op->inputIndexes()->size(); + auto dims = SizeComputer::needInputContent(op, inputNum); + for (auto index : dims) { + if (index < inputNum) { + if (constMask[op->inputIndexes()->data()[index]] != 1) { + res.emplace_back(v); + break; + } + } + } + } + return res; +} +static std::vector _splitSubModuleForShapeConst(const std::vector& origin, const Net* net, std::shared_ptr sharedConst) { + std::vector res; + for (auto& m : origin) { + if (m.isBreak) { + res.emplace_back(std::move(m)); + continue; + } + auto breakIndexes = _findBreakIndex(m, net, sharedConst); + if (breakIndexes.size() > 0) { + int current = 0; + for (auto breakIndex : breakIndexes) { + // Split + if (breakIndex > current) { + SubModuleInfo m0; + m0.opList.insert(m0.opList.begin(), m.opList.begin() + current, m.opList.begin() + breakIndex); + res.emplace_back(std::move(m0)); + } + SubModuleInfo m1; + m1.opList = {m.opList[breakIndex]}; + res.emplace_back(std::move(m1)); + current = breakIndex + 1; + } + if (current < m.opList.size()) { + SubModuleInfo m2; + m2.opList.insert(m2.opList.begin(), m.opList.begin() + current, m.opList.end()); + res.emplace_back(std::move(m2)); + } + } else { + res.emplace_back(std::move(m)); + } + } + return res; +} + static std::vector _createSubModuleInfo(std::shared_ptr bufferStorage, const std::set& inputIndexes, const std::set& outputIndexes, const std::set& noComputeIndexes, std::shared_ptr sharedConst) { std::vector submodule; auto net = flatbuffers::GetRoot(bufferStorage->buffer()); @@ -400,8 +507,6 @@ static std::vector _createSubModuleInfo(std::shared_ptr 0) { // Not empty - // Init tensormask - _computeTensorMask(current, net); submodule.emplace_back(std::move(current)); } SubModuleInfo controlOp; @@ -421,13 +526,14 @@ static std::vector _createSubModuleInfo(std::shared_ptr bufferStorage, co scheduleInfo.defaultBackend = sharedConst->defaultBackend; scheduleInfo.constReplaceBackend = sharedConst->constReplaceBackend; scheduleInfo.allTensors = sharedConst->allTensors; - scheduleInfo.validForResize = initTensors(scheduleInfo.allTensors, net); + scheduleInfo.validForResize = initTensors(scheduleInfo.allTensors, net, info.opList.data(), info.opList.size()); std::vector oplists; std::vector ops; ops.reserve(info.opList.size()); @@ -633,10 +739,7 @@ Module* PipelineModule::load(const std::vector& inputs, const std:: modRuntime.userConfig = &rtMgr->getInside()->mConfig; modRuntime.compute.type = modRuntime.rt.first.begin()->first; modRuntime.compute.numThread = 1; - // set allocator type modRuntime.rt.first.begin()->second->setRuntimeHint(rtMgr->getInside()->modes.runtimeHint); - // set winograd memory type - modRuntime.rt.second->setRuntimeHint(rtMgr->getInside()->modes.runtimeHint); } auto& rt = modRuntime.rt; auto firstRt = rt.first[modRuntime.compute.type]; @@ -687,16 +790,51 @@ Module* PipelineModule::load(const std::vector& inputs, const std:: MNN_ERROR("\n"); return nullptr; } - for (auto index : noneedComputeIndexes) { - auto tensor = Tensor::clone(sharedConst->allTensors[index].get()); - auto constVar = Variable::create(Expr::create(tensor, true)); - initVars.insert(std::make_pair(index, constVar)); - } auto subModulesInfo = _createSubModuleInfo(bufferStorage, inputIndexes, outputIndexes, noneedComputeIndexes, sharedConst); std::vector> subModules(subModulesInfo.size()); for (int i=0; iconstReplaceBackend.get(); + if (sharedConst->constReplaceBackend->type() != sharedConst->defaultBackend->type()) { + for (auto& t : sharedConst->allTensors) { + if (nullptr == t.get()) { + continue; + } + auto des = TensorUtils::getDescribe(t.get()); + if (des->isMutable) { + continue; + } + if (!WrapExecution::needWrap(t.get(), curBackend)) { + continue; + } + if (des->stageMask & Tensor::InsideDescribe::GEOMETRY_STAGE) { + continue; + } + if (des->stageMask & Tensor::InsideDescribe::CONVERTED_STAGE) { + continue; + } + std::shared_ptr wrapTensor = WrapExecution::makeCopyTensor(t.get(), curBackend); + auto outDes = TensorUtils::getDescribe(wrapTensor.get()); + outDes->usage = des->usage; + auto tempRes = WrapExecution::allocAndCopy(curBackend, t.get(), wrapTensor.get()); + if (!tempRes) { + continue; + } + outDes->stageMask |= Tensor::InsideDescribe::CONVERTED_STAGE; + WrapExecution::copyReplaceTensor(wrapTensor.get(), t.get()); + } + } + // Clear CPU Const memory + rt.second->onGabageCollect(0); + } + for (auto index : noneedComputeIndexes) { + auto tensor = Tensor::clone(sharedConst->allTensors[index].get()); + auto constVar = Variable::create(Expr::create(tensor, true)); + initVars.insert(std::make_pair(index, constVar)); + } auto result = new PipelineModule; result->mInputSize = inputs.size(); /** @@ -751,39 +889,6 @@ Module* PipelineModule::load(const std::vector& inputs, const std:: } result->registerModel(subModules); result->mSharedConst = sharedConst; - if (!permitCodeGen) { - // Prereplace const tensor - auto curBackend = sharedConst->constReplaceBackend.get(); - if (sharedConst->constReplaceBackend->type() != sharedConst->defaultBackend->type()) { - for (auto& t : sharedConst->allTensors) { - if (nullptr == t.get()) { - continue; - } - auto des = TensorUtils::getDescribe(t.get()); - if (des->isMutable) { - continue; - } - if (!WrapExecution::needWrap(t.get(), curBackend)) { - continue; - } - if (des->stageMask & Tensor::InsideDescribe::GEOMETRY_STAGE) { - continue; - } - if (des->stageMask & Tensor::InsideDescribe::CONVERTED_STAGE) { - continue; - } - std::shared_ptr wrapTensor = WrapExecution::makeCopyTensor(t.get(), curBackend); - auto outDes = TensorUtils::getDescribe(wrapTensor.get()); - outDes->usage = des->usage; - auto tempRes = WrapExecution::allocAndCopy(curBackend, t.get(), wrapTensor.get()); - if (!tempRes) { - continue; - } - outDes->stageMask |= Tensor::InsideDescribe::CONVERTED_STAGE; - WrapExecution::copyReplaceTensor(wrapTensor.get(), t.get()); - } - } - } return result; } diff --git a/express/module/PipelineModule.hpp b/express/module/PipelineModule.hpp index 198f06063..9b28c0c96 100644 --- a/express/module/PipelineModule.hpp +++ b/express/module/PipelineModule.hpp @@ -49,6 +49,7 @@ class PipelineModule : public Module { MNN_PUBLIC PipelineModule(std::vector inputs, std::vector outputs, const Transformer& transformFunction = {}); + int onOptimize(Interpreter::SessionMode stage) override; private: static Module* load(const std::vector& inputs, const std::vector& outputs, std::shared_ptr bufferStorage, const std::shared_ptr rtMgr, const Module::Config* config, std::map& subGraphMap); static void _createSubGraph(const MNN::Net* net, std::shared_ptr rtMgr, const Module::Config* config, std::map& subGraphMap); @@ -64,6 +65,7 @@ class PipelineModule : public Module { friend class NN; std::vector mInitVars; std::shared_ptr mSharedConst; + bool mSeperate = false; }; } // namespace Express } // namespace MNN diff --git a/express/module/StaticModule.cpp b/express/module/StaticModule.cpp index f382d2ff6..ec5fd2982 100644 --- a/express/module/StaticModule.cpp +++ b/express/module/StaticModule.cpp @@ -22,14 +22,41 @@ namespace MNN { namespace Express { +static const StaticModule* getStaticModule(const Module* m) { + if (m->type() == "StaticModule") { + return static_cast(m); + } + if (m->getChildren().empty()) { + return nullptr; + } + return getStaticModule(m->getChildren()[0].get()); +} + static std::vector> preRearrangeWeights( // NOLINT - Schedule::ScheduleInfo& scheduleInfo, Backend* backend, Backend* backupBackend) { + Schedule::ScheduleInfo& scheduleInfo, Backend* backend, Backend* backupBackend, const Module* base = nullptr) { + std::map> base_executions; + if (base != nullptr) { + // has base module + auto static_module = getStaticModule(base); + if (static_module) { + auto session = static_module->getSession(); + std::vector op_caches = session->getPipelineInfo(0).second; + for (auto& op_cache : op_caches) { + const auto& exe_cache = op_cache.executionCache; + for (const auto& exe_item : exe_cache) { + if (exe_item.first->name()) { + base_executions.insert(std::make_pair(exe_item.first->name()->str(), exe_item.second)); + } + } + } + } + } FileLoader loader(scheduleInfo.externalWeightPath.c_str()); auto&& pipelineInfo = scheduleInfo.pipelineInfo[0].second; std::vector> splitOps(pipelineInfo.size()); for (int i = 0; i < pipelineInfo.size(); ++i) { auto& info = pipelineInfo[i]; - auto op = pipelineInfo[i].op; + auto op = pipelineInfo[i].op; std::unique_ptr op_table(op->UnPack()); std::shared_ptr exe; switch (op->type()) { @@ -37,52 +64,68 @@ static std::vector> preRearrangeWeights( // NOLIN case MNN::OpType_ConvInt8: case MNN::OpType_ConvolutionDepthwise: case MNN::OpType_Convolution: { - DataType type = DataType_DT_FLOAT; - auto conv2d = op->main_as_Convolution2D(); - // Create Default Inputs and Outputs - auto tempInput = info.inputs[0]; - auto tempOutput = info.outputs[0]; - auto common = conv2d->common(); - if (scheduleInfo.pipelineInfo[0].first.needComputeGeometry) { - // Set default shape to create execution - int ow = 2, oh = 2; - int iw = (common->kernelX() - 1) * common->dilateX() + common->strideX() * (ow - 1) + 1; - int ih = (common->kernelY() - 1) * common->dilateY() + common->strideY() * (oh - 1) + 1; - TensorUtils::getDescribe(tempInput)->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;; - tempInput->setLength(0, 1); - tempInput->setLength(1, conv2d->common()->inputCount()); - tempInput->setLength(2, ih); - tempInput->setLength(3, iw); - TensorUtils::getDescribe(tempOutput)->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;; - tempOutput->setLength(0, 1); - tempOutput->setLength(1, conv2d->common()->outputCount()); - tempOutput->setLength(2, oh); - tempOutput->setLength(3, ow); - if (op->main_as_Convolution2D()->quanParameter()) { - type = DataType_DT_INT8; - int inputIdx = op->inputIndexes()->Get(0); - auto& inputQuantAttr = TensorUtils::getDescribe(tempInput)->quantAttr; - if (nullptr != inputQuantAttr.get()) { - TensorUtils::getDescribe(tempInput)->type = DataType_DT_INT8; + if (!base_executions.empty() && op->name()) { + auto iter = base_executions.find(op->name()->str()); + if (iter != base_executions.end()) { + auto base_exe = iter->second.get(); + Execution* copyExecution = nullptr; + base_exe->onClone(backend, op, ©Execution); + if (copyExecution == nullptr) { + base_exe->onClone(backupBackend, op, ©Execution); } - auto& outputQuantAttr = TensorUtils::getDescribe(tempOutput)->quantAttr; - if (nullptr != outputQuantAttr.get()) { - TensorUtils::getDescribe(tempOutput)->type = DataType_DT_INT8; + if (copyExecution != nullptr && copyExecution->onClone(nullptr, op, nullptr)) { + exe.reset(copyExecution); } } } - std::shared_ptr tmpstorage; - exe.reset(OpCommonUtils::createExecutionWithExternal(backend, info.inputs, info.outputs, op, &loader, tmpstorage)); - if (exe.get() == nullptr) { - exe.reset(OpCommonUtils::createExecutionWithExternal(backupBackend, info.inputs, info.outputs, op, &loader, tmpstorage)); - } - if (nullptr == exe) { - break; - } - // The exe can't clone - if (!exe->onClone(nullptr, op, nullptr)) { - exe = nullptr; - break; + if (exe == nullptr) { + DataType type = DataType_DT_FLOAT; + auto conv2d = op->main_as_Convolution2D(); + // Create Default Inputs and Outputs + auto tempInput = info.inputs[0]; + auto tempOutput = info.outputs[0]; + auto common = conv2d->common(); + if (scheduleInfo.pipelineInfo[0].first.needComputeGeometry) { + // Set default shape to create execution + int ow = 2, oh = 2; + int iw = (common->kernelX() - 1) * common->dilateX() + common->strideX() * (ow - 1) + 1; + int ih = (common->kernelY() - 1) * common->dilateY() + common->strideY() * (oh - 1) + 1; + TensorUtils::getDescribe(tempInput)->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;; + tempInput->setLength(0, 1); + tempInput->setLength(1, conv2d->common()->inputCount()); + tempInput->setLength(2, ih); + tempInput->setLength(3, iw); + TensorUtils::getDescribe(tempOutput)->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;; + tempOutput->setLength(0, 1); + tempOutput->setLength(1, conv2d->common()->outputCount()); + tempOutput->setLength(2, oh); + tempOutput->setLength(3, ow); + if (op->main_as_Convolution2D()->quanParameter()) { + type = DataType_DT_INT8; + int inputIdx = op->inputIndexes()->Get(0); + auto& inputQuantAttr = TensorUtils::getDescribe(tempInput)->quantAttr; + if (nullptr != inputQuantAttr.get()) { + TensorUtils::getDescribe(tempInput)->type = DataType_DT_INT8; + } + auto& outputQuantAttr = TensorUtils::getDescribe(tempOutput)->quantAttr; + if (nullptr != outputQuantAttr.get()) { + TensorUtils::getDescribe(tempOutput)->type = DataType_DT_INT8; + } + } + } + std::shared_ptr tmpstorage; + exe.reset(OpCommonUtils::createExecutionWithExternal(backend, info.inputs, info.outputs, op, &loader, tmpstorage)); + if (exe.get() == nullptr) { + exe.reset(OpCommonUtils::createExecutionWithExternal(backupBackend, info.inputs, info.outputs, op, &loader, tmpstorage)); + } + if (nullptr == exe) { + break; + } + // The exe can't clone + if (!exe->onClone(nullptr, op, nullptr)) { + exe = nullptr; + break; + } } if (OpParameter_Convolution2D == op_table->main.type) { op_table->main.AsConvolution2D()->bias.clear(); @@ -148,12 +191,12 @@ static bool _reshapeTensor(Tensor* tensor, const Tensor* dims) { } return dirty; } -static void _resizeTensor(Tensor* tensor, const Tensor* dims, Session* session, Schedule::TENSORCACHE* cacheTensor) { +static bool _resizeTensor(Tensor* tensor, const Tensor* dims, Session* session, Schedule::TENSORCACHE* cacheTensor) { MNN_ASSERT(nullptr != tensor); bool dirty = _reshapeTensor(tensor, dims); if (!dirty) { - return; + return false; } tensor->buffer().dimensions = (int)dims->dimensions(); @@ -172,7 +215,7 @@ static void _resizeTensor(Tensor* tensor, const Tensor* dims, Session* session, std::get<2>(*cacheTensor) = true; } } - session->setNeedResize(); + return true; } void StaticModule::resetInputOutputs() { mPrevInputTensor.resize(mResource->mInputs.size()); @@ -196,6 +239,49 @@ void StaticModule::resetInputOutputs() { des->usage = Tensor::InsideDescribe::OUTPUT; } } + // Mask Geometry Compute Mid Tensor release able indexes + auto& infos = pipelineInfo; + for (auto& info : infos.second) { + info.releaseAbleInputs.clear(); + if (info.type != Schedule::Type::CONSTANT) { + continue; + } + for (auto t : info.inputs) { + auto des = TensorUtils::getDescribe(t); + if (des->usage == Tensor::InsideDescribe::CONSTANT && des->isMutable) { + des->useCount = 0; + } + } + } + for (auto& info : infos.second) { + for (auto t : info.inputs) { + auto des = TensorUtils::getDescribe(t); + if (des->usage == Tensor::InsideDescribe::CONSTANT && des->isMutable) { + des->useCount++; + } + } + } + for (int i = 0; i < mResource->mOutputFromTensor.size(); ++i) { + mOutputTensors[i] = mSession->getTensor(mResource->mOutputs[mResource->mOutputFromTensor[i]]); + auto des = TensorUtils::getDescribe(mOutputTensors[i]); + if (des->usage == Tensor::InsideDescribe::CONSTANT && des->isMutable) { + des->useCount ++; + } + } + for (auto& info : infos.second) { + if (info.type != Schedule::Type::CONSTANT) { + continue; + } + for (int v=0; vusage == Tensor::InsideDescribe::CONSTANT && des->isMutable) { + des->useCount--; + if (des->useCount == 0) { + info.releaseAbleInputs.emplace_back(v); + } + } + } + } } StaticModule::StaticModule(std::vector inputs, @@ -233,7 +319,7 @@ StaticModule::StaticModule(std::vector inputs, } } if (config.rearrange) { - mResource->mBuffer = preRearrangeWeights(scheduleInfo, bnCache.cache.first.get(), bnCache.cache.second.get()); + mResource->mBuffer = preRearrangeWeights(scheduleInfo, bnCache.cache.first.get(), bnCache.cache.second.get(), config.base); } else { mResource->mBuffer = std::move(buffer); } @@ -300,33 +386,12 @@ void StaticModule::onClearCache() { } } } - -std::vector StaticModule::onForward(const std::vector& inputs) { - - AUTOTIME; - std::vector outputs(mResource->mOutputNumbers); - for (auto& iter : mResource->mOutputFromInput) { - outputs[iter.first] = inputs[iter.second]; - } - if (mResource->mOutputFromTensor.empty()) { - return outputs; - } - Variable::compute(inputs); -#ifdef MNN_DUMP_MEMORY - auto rt = Executor::getRuntime(); - auto mem = rt.second->onGetMemoryInMB(); - for (auto iter : rt.first) { - if (iter.second.get() != rt.second.get()) { - mem += iter.second->onGetMemoryInMB(); - } - } - FUNC_PRINT_ALL(mem, f); -#endif - - MNN_ASSERT(inputs.size() == mInputTensors.size()); +ErrorCode StaticModule::_resize(const std::vector& inputs) { + ErrorCode code = NO_ERROR; auto& pipelineInfo = mSession->getPipelineInfo(0); if (mResource->mModes.inputMode == Interpreter::Session_Input_User) { pipelineInfo.first.inputBackendChange = false; + bool needResize = mResource->mUseContentInputs; for (int i = 0; i < inputs.size(); ++i) { if (nullptr == mInputTensors[i]) { continue; @@ -366,6 +431,7 @@ std::vector StaticModule::onForward(const std::vectortensorArrayAttr.get() != nullptr) { // For tensorArray, don't need content needCopy = false; + mSession->setNeedResize(); } bool needMalloc; if (needCopy) { @@ -390,18 +456,38 @@ std::vector StaticModule::onForward(const std::vectordimensionFormat = srcDes->dimensionFormat; des->tensorArrayAttr = srcDes->tensorArrayAttr; mInputTensors[i]->buffer().type = inputTensor->buffer().type; - _resizeTensor(mInputTensors[i], inputTensor, mSession.get(), cacheTensor); + if (_resizeTensor(mInputTensors[i], inputTensor, mSession.get(), cacheTensor)) { + needResize = true; + } if (needMalloc) { mSession->setNeedMalloc(); } } - if (mResource->mUseContentInputs) { + if (needResize) { mSession->setNeedResize(); } - auto code = mSession->resize(); - if (NO_ERROR != code) { - FUNC_PRINT(code); - return {}; + code = mSession->resize(); + if (!needResize) { + // Check if output is used by other vars. If used, must realloc output to avoid the content dirty for output vars + // If resized, the output's memory will be all released in Session::resize, don't need clear here + for (auto& output : mOutputTensors) { + auto desOrigin = TensorUtils::getDescribeOrigin(output); + if ((!desOrigin->mContent->isMutable) || nullptr == desOrigin->mem.get()) { + continue; + } + auto bn = desOrigin->getBackend(); + if (nullptr == bn) { + continue; + } + if (desOrigin->mContent.use_count() > 1 && desOrigin->mContent->usage != Tensor::InsideDescribe::CONSTANT) { + desOrigin->mem = nullptr; + auto res = bn->onAcquireBuffer(output, Backend::STATIC); + if (!res) { + return OUT_OF_MEMORY; + } + mSession->setNeedMalloc(); + } + } } } else { // Resize @@ -414,9 +500,11 @@ std::vector StaticModule::onForward(const std::vectordimensionFormat = srcDes->dimensionFormat; mInputTensors[i]->buffer().type = inputTensor->buffer().type; - _resizeTensor(mInputTensors[i], inputTensor, mSession.get(), nullptr); + if (_resizeTensor(mInputTensors[i], inputTensor, mSession.get(), nullptr)) { + mSession->setNeedResize(); + } } - mSession->resize(); + code = mSession->resize(); // Copy for (int i = 0; i < inputs.size(); ++i) { if (nullptr == mInputTensors[i]) { @@ -427,19 +515,10 @@ std::vector StaticModule::onForward(const std::vectorcopyFromHostTensor(inputTensor); } } + return code; +} - -#ifdef LOG_VERBOSE - for (auto& inputTensor : mInputTensors) { - MNN_PRINT("static module, before run, input ptr:%p, hostPtr:%p, shape:", inputTensor, inputTensor->host()); - inputTensor->printShape(); - MNN_PRINT("\n"); - auto shape = inputTensor->shape(); - } - MNN_PRINT("staticmodule before run\n"); -#endif - - +ErrorCode StaticModule::_execute() { ErrorCode code; if (mResource->mModes.callBackMode == Interpreter::Session_Debug) { auto globalExecutor = ExecutorScope::Current(); @@ -452,9 +531,58 @@ std::vector StaticModule::onForward(const std::vectorrun(); } + return code; +} + +std::vector StaticModule::onForward(const std::vector& inputs) { + + AUTOTIME; + std::vector outputs; + bool runResize = (!mShapeInferSeperate) || inputs.size() > 0; + bool runCompute = (!mShapeInferSeperate) || inputs.size() == 0; + if (runResize) { + outputs.resize(mResource->mOutputNumbers); + for (auto& iter : mResource->mOutputFromInput) { + outputs[iter.first] = inputs[iter.second]; + } + } + if (mResource->mOutputFromTensor.empty()) { + return outputs; + } + Variable::compute(inputs); +#ifdef MNN_DUMP_MEMORY + auto rt = Executor::getRuntime(); + auto mem = rt.second->onGetMemoryInMB(); + for (auto iter : rt.first) { + if (iter.second.get() != rt.second.get()) { + mem += iter.second->onGetMemoryInMB(); + } + } + FUNC_PRINT_ALL(mem, f); +#endif + + ErrorCode code = NO_ERROR; + if (runResize) { + code = _resize(inputs); + } + if (NO_ERROR == code && runCompute) { + code = _execute(); + } if (NO_ERROR != code) { + FUNC_PRINT(code); return {}; } + if (!runResize) { + for (auto& var : mOutputVars) { + // Check if needed recopy + auto inside = var->expr().first->inside(); + if (nullptr != inside->mHostTensor) { + inside->mOutputTensors[0]->copyToHostTensor(inside->mHostTensor); + } + } + return {}; + } + auto& pipelineInfo = mSession->getPipelineInfo(0); for (int i = 0; i < mOutputTensors.size(); ++i) { auto tensor = Tensor::clone(mOutputTensors[i]); outputs[mResource->mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(tensor, true)); @@ -469,7 +597,9 @@ std::vector StaticModule::onForward(const std::vectormOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = mResource->mSharedConst->constReplaceBackend; } } - + if (mShapeInferSeperate && runResize) { + mOutputVars = outputs; + } #ifdef MNN_INTERNAL_ENABLED auto glo = ExecutorScope::Current(); float flops = 0.0f; @@ -492,6 +622,7 @@ Module* StaticModule::clone(CloneContext* ctx) const { return this->cloneBaseTo(ctx, module); } int StaticModule::onOptimize(Interpreter::SessionMode stage) { + int res = 0; switch (stage) { case MNN::Interpreter::Session_Resize_Check: mSession->openResizeCheck(); @@ -499,10 +630,21 @@ int StaticModule::onOptimize(Interpreter::SessionMode stage) { case MNN::Interpreter::Session_Resize_Fix: mSession->fixResizeCache(); break; + case MNN::Interpreter::Module_Forward_Separate: + if (mResource->mUseContentInputs || mResource->mModes.inputMode != Interpreter::Session_Input_User || mResource->mOutputFromTensor.empty()) { + res = NOT_SUPPORT; + break; + } + mShapeInferSeperate = true; + break; + case MNN::Interpreter::Module_Forward_Combine: + mOutputVars.clear(); + mShapeInferSeperate = false; + break; default: break; } - return 0; + return res; } } // namespace Express diff --git a/express/module/StaticModule.hpp b/express/module/StaticModule.hpp index 3b5b8bb5d..582ae92fb 100644 --- a/express/module/StaticModule.hpp +++ b/express/module/StaticModule.hpp @@ -25,8 +25,12 @@ class StaticModule : public Module { virtual std::vector onForward(const std::vector& inputs) override; virtual void onClearCache() override; virtual int onOptimize(Interpreter::SessionMode stage) override; + const Session* getSession() const { return mSession.get(); } private: + ErrorCode _resize(const std::vector& inputs); + ErrorCode _execute(); + StaticModule() = default; void resetInputOutputs(); @@ -52,6 +56,8 @@ class StaticModule : public Module { std::vector> mPrevInputTensor; std::vector mOutputTensors; std::shared_ptr mResource; + bool mShapeInferSeperate = false; + std::vector mOutputVars; }; } } diff --git a/include/MNN/ErrorCode.hpp b/include/MNN/ErrorCode.hpp index 7ee64e6be..4d40d60e4 100644 --- a/include/MNN/ErrorCode.hpp +++ b/include/MNN/ErrorCode.hpp @@ -28,6 +28,16 @@ enum ErrorCode { // Op Resize Error TENSOR_NOT_SUPPORT = 20, TENSOR_NEED_DIVIDE = 21, + + // File error + FILE_CREATE_FAILED = 30, + FILE_REMOVE_FAILED = 31, + FILE_OPEN_FAILED = 32, + FILE_CLOSE_FAILED = 33, + FILE_RESIZE_FAILED = 34, + FILE_SEEK_FAILED = 35, + FILE_NOT_EXIST = 36, + FILE_UNMAP_FAILED = 37 }; } // namespace MNN diff --git a/include/MNN/Interpreter.hpp b/include/MNN/Interpreter.hpp index 6debbe3f0..bac8fb341 100644 --- a/include/MNN/Interpreter.hpp +++ b/include/MNN/Interpreter.hpp @@ -163,6 +163,15 @@ class MNN_PUBLIC Interpreter { /** Dynamic Reisze Optimization */ Session_Resize_Check = 14, // Open Trace for resize Session_Resize_Fix = 15, // Apply Resize Optimization + + /** Set for Module's traceOrOptimize API. + Module_Forward_Seperate: + when inputs is not empty , Module's onForward will only infer shape and alloc memory. + when inputs is empty , Module's onForward will only runSession to compute content. + Default is Module_Forward_Combine + */ + Module_Forward_Separate = 16, + Module_Forward_Combine = 17, }; /** * @brief The API shoud be called before create session. @@ -220,6 +229,17 @@ class MNN_PUBLIC Interpreter { // 2: Only quantize value cache, use fp8 quantization // 3: quantize both key and value cache as described above KVCACHE_QUANT_OPTIONS = 7, + + // size limit of kvcache in memory (for a single layer) + // if the size of kvcache exceeds the limit, it will be moved to disk + KVCACHE_SIZE_LIMIT = 8, + }; + + enum ExternalPathType { + // Path of the kvcache directory + EXTERNAL_PATH_KVCACHE_DIR = 0, + + // Other types ... }; enum GeometryComputeMask { diff --git a/include/MNN/MNNDefine.h b/include/MNN/MNNDefine.h index ab84cd8f8..215939a99 100644 --- a/include/MNN/MNNDefine.h +++ b/include/MNN/MNNDefine.h @@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \ #define STR(x) STR_IMP(x) #define MNN_VERSION_MAJOR 2 #define MNN_VERSION_MINOR 9 -#define MNN_VERSION_PATCH 3 +#define MNN_VERSION_PATCH 4 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH) #endif /* MNNDefine_h */ diff --git a/include/MNN/expr/Executor.hpp b/include/MNN/expr/Executor.hpp index 3871827c9..367c15d03 100644 --- a/include/MNN/expr/Executor.hpp +++ b/include/MNN/expr/Executor.hpp @@ -103,6 +103,13 @@ class MNN_PUBLIC Executor { */ void setCache(std::string cacheName); + /** + * @brief set the path of external files or directory + * @param path -- The path of a file or directory on disk + * @param type -- Type of the external path (see "enum ExternalPathType" in Interpreter.hpp) + */ + void setExternalPath(std::string path, int type); + /** * @brief set external file. */ diff --git a/include/MNN/expr/Module.hpp b/include/MNN/expr/Module.hpp index 2436b3985..1e5562de8 100644 --- a/include/MNN/expr/Module.hpp +++ b/include/MNN/expr/Module.hpp @@ -17,6 +17,7 @@ #include namespace MNN { +class Session; namespace Express { struct SubGraph; class MNN_PUBLIC Module { @@ -47,7 +48,7 @@ class MNN_PUBLIC Module { void setParameter(Express::VARP parameter, int index); static Module* createEmpty(const std::vector& parameters); - + struct BackendInfo { MNNForwardType type = MNN_FORWARD_CPU; BackendConfig* config = nullptr; @@ -63,8 +64,11 @@ class MNN_PUBLIC Module { // The weights will be rearranged in a general way, so the best implementation // may not be adopted if `rearrange` is enabled. bool rearrange = false; - + BackendInfo* backend = nullptr; + + // base module + const Module* base = nullptr; }; static Module* load(const std::vector& inputs, const std::vector& outputs, const uint8_t* buffer, size_t length, const Config* config = nullptr); static Module* load(const std::vector& inputs, const std::vector& outputs, const char* fileName, const Config* config = nullptr); @@ -102,7 +106,6 @@ class MNN_PUBLIC Module { EXPRP getOrClone(const EXPRP expr); VARP getOrClone(const VARP var); - private: bool mShareParams = false; std::unordered_map mExprMap; @@ -117,6 +120,7 @@ class MNN_PUBLIC Module { static void destroy(Module* m); int traceOrOptimize(Interpreter::SessionMode stage); + std::vector> getChildren() const { return mChildren; } protected: virtual int onOptimize(Interpreter::SessionMode stage) { return 0; diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj index 009adba67..f576703bf 100644 --- a/project/ios/MNN.xcodeproj/project.pbxproj +++ b/project/ios/MNN.xcodeproj/project.pbxproj @@ -743,6 +743,8 @@ 9560EAD62BDE426A00C8D0B6 /* GeometryLayernorm.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9560EAD52BDE426A00C8D0B6 /* GeometryLayernorm.cpp */; }; 956F52E12AB2D692004B13D9 /* ImageProcessUtils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 956F52E02AB2D692004B13D9 /* ImageProcessUtils.cpp */; }; 956F52E32AB2D6A1004B13D9 /* ImageProcessUtils.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 956F52E22AB2D6A1004B13D9 /* ImageProcessUtils.hpp */; }; + 95772DCF2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S in Sources */ = {isa = PBXBuildFile; fileRef = 95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */; }; + 95772DD02C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S in Sources */ = {isa = PBXBuildFile; fileRef = 95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */; }; 958375352A496E5C007C0A3E /* MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S in Sources */ = {isa = PBXBuildFile; fileRef = 958375342A496E5C007C0A3E /* MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S */; }; 958B046429D2C89D00FC3AEF /* GemmInt8Executor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 958B046329D2C89D00FC3AEF /* GemmInt8Executor.cpp */; }; 958B046629D2C8AF00FC3AEF /* GemmInt8Executor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 958B046529D2C8AF00FC3AEF /* GemmInt8Executor.hpp */; }; @@ -1577,6 +1579,8 @@ 9560EAD52BDE426A00C8D0B6 /* GeometryLayernorm.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryLayernorm.cpp; sourceTree = ""; }; 956F52E02AB2D692004B13D9 /* ImageProcessUtils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ImageProcessUtils.cpp; sourceTree = ""; }; 956F52E22AB2D6A1004B13D9 /* ImageProcessUtils.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ImageProcessUtils.hpp; sourceTree = ""; }; + 95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackC4Int8ForMatMulA_ARM82.S; sourceTree = ""; }; + 95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNPackC4Int8ForMatMulA_ARM86.S; sourceTree = ""; }; 958375342A496E5C007C0A3E /* MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S; path = arm/arm64/MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3.S; sourceTree = ""; }; 958B046329D2C89D00FC3AEF /* GemmInt8Executor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GemmInt8Executor.cpp; sourceTree = ""; }; 958B046529D2C8AF00FC3AEF /* GemmInt8Executor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = GemmInt8Executor.hpp; sourceTree = ""; }; @@ -1853,6 +1857,7 @@ 488873A8215B639D0079B12E /* source */ = { isa = PBXGroup; children = ( + CE482EF5288536DA007CD935 /* internal */, 4DF87C482887D3560003E2D4 /* calib3d */, 4D4CF4612760946500A36D9F /* imgproc */, 4D9A931B26255BDA00F9B43C /* coreml */, @@ -2591,6 +2596,8 @@ 92FF017C23AA0B4E00AC97F6 /* arm64 */ = { isa = PBXGroup; children = ( + 95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */, + 95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */, 4DDD8E0F2B1D70C1005065D1 /* MNNTranspose16Bit8x8.S */, 95CE1E002AC57F7600EFB51E /* MNNReluWithSlopeChannelInt8.S */, CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */, @@ -2884,16 +2891,19 @@ CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */, 4DE4E82C275E307B0016A916 /* cv in Headers */, 1F501F842397BA5B004E8721 /* ImageProcess.hpp in Headers */, + CECF8C5D299CACFD00D3875B /* Log.hpp in Headers */, 1F501F822397BA5B004E8721 /* Interpreter.hpp in Headers */, C4F906B327688C3A0026B847 /* NMSModule.hpp in Headers */, 1F501F882397BA5B004E8721 /* Tensor.hpp in Headers */, 1F501F872397BA5B004E8721 /* Matrix.h in Headers */, CE8049AC2B31C65B009B422C /* CPULayerNorm.hpp in Headers */, + CECF8C5A299CACFD00D3875B /* WorkerThread.hpp in Headers */, 48C84B85250F711700EE7666 /* IfModule.hpp in Headers */, 4D9A937326255BDA00F9B43C /* CoreMLUnary.hpp in Headers */, 48C84B98250F71E900EE7666 /* CPUSoftmax.hpp in Headers */, 4882C8B8241A22B800DAC168 /* OpCommonUtils.hpp in Headers */, 48608B54250632EC00CB1D71 /* GeometryComputer.hpp in Headers */, + CECF8C7A299CAD9400D3875B /* sha1.h in Headers */, 4894C6EC27016F7200D8BE79 /* CPUResizeCache.hpp in Headers */, 92FF04A623AA0BFB00AC97F6 /* FileLoader.hpp in Headers */, 48F34733273A7C8400C45394 /* ImageProcessFunction.hpp in Headers */, @@ -2907,6 +2917,7 @@ 48925F352744AC0700919B37 /* CPUROIAlign.hpp in Headers */, 92FF029623AA0B5A00AC97F6 /* CPUCast.hpp in Headers */, 4D9A937826255BDA00F9B43C /* CoreMLBinary.hpp in Headers */, + CECF8C85299CAD9400D3875B /* log_util.h in Headers */, 4D6D7FD52656896600F80814 /* DenseConvolutionTiledExecutor.hpp in Headers */, 4D9A936626255BDA00F9B43C /* CoreMLExecutor.h in Headers */, 92FF027A23AA0B5A00AC97F6 /* CPUPool.hpp in Headers */, @@ -2915,6 +2926,7 @@ 1F501F802397BA5B004E8721 /* MNNDefine.h in Headers */, 19D0FE76285C66F200B74B1A /* MetalLayerNorm.hpp in Headers */, 489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */, + CECF8C86299CAD9400D3875B /* sds.h in Headers */, 1F501F7F2397BA5B004E8721 /* HalideRuntime.h in Headers */, 92FF029E23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.hpp in Headers */, 4D9A935B26255BDA00F9B43C /* NeuralNetwork.pb-c.h in Headers */, @@ -2935,8 +2947,10 @@ 481C2DEE25FE2CD6001ED6DF /* Arm82Functions.hpp in Headers */, 4894C6EA27016F7200D8BE79 /* UnaryUtils.hpp in Headers */, EBD4842A2485FF650083CE95 /* Arm82Interp.hpp in Headers */, + CECF8C81299CAD9400D3875B /* log_util_imp.h in Headers */, 92FF037623AA0B5A00AC97F6 /* CPUBinary.hpp in Headers */, 4D9A935826255BDA00F9B43C /* FeatureTypes.pb-c.h in Headers */, + CECF8C7C299CAD9400D3875B /* hmac-sha.h in Headers */, 48608B53250632EC00CB1D71 /* GeometryComputerUtils.hpp in Headers */, 950B28F529F629A90002F454 /* CPUBinaryInt8.hpp in Headers */, 489D7A732550FDC800AD896A /* MetalBackend.hpp in Headers */, @@ -2959,6 +2973,7 @@ 4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */, 48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */, 92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */, + CECF8C77299CAD9400D3875B /* log_builder.h in Headers */, 4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */, 92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */, 4AF4FB26269ED235005BA97B /* SparseConvInt8TiledExecutor.hpp in Headers */, @@ -2996,6 +3011,7 @@ 92FF03CA23AA0B5A00AC97F6 /* CPUConvolutionDepthwise.hpp in Headers */, 92FF04A923AA0BFB00AC97F6 /* Schedule.hpp in Headers */, 489D7A9F2550FDC900AD896A /* MetalConvolutionCommon.hpp in Headers */, + CECF8C80299CAD9400D3875B /* lz4.h in Headers */, 92FF028623AA0B5A00AC97F6 /* CPUDeconvolution.hpp in Headers */, 489D7A722550FDC800AD896A /* MetalReLU6.hpp in Headers */, 92FF04B523AA0BFB00AC97F6 /* TensorUtils.hpp in Headers */, @@ -3047,20 +3063,24 @@ 92FF03A623AA0B5A00AC97F6 /* ConvolutionTiledExecutor.hpp in Headers */, 92FF036523AA0B5A00AC97F6 /* CPUResize.hpp in Headers */, 92FF04B423AA0BFB00AC97F6 /* MNNMemoryUtils.h in Headers */, + CECF8C88299CAD9400D3875B /* log_api.h in Headers */, 4A224A0D27D0C2D9000A9260 /* ConvolutionPackWinograd.hpp in Headers */, 4A224A0E27D0C2D9000A9260 /* ConvolutionPackFreeWinograd.hpp in Headers */, 4D9A937426255BDA00F9B43C /* CoreMLReduction.hpp in Headers */, 48C84B8B250F711700EE7666 /* PipelineModule.hpp in Headers */, F41497D7278D8A21004A363A /* RuntimeAttr.hpp in Headers */, + CECF8C5B299CACFD00D3875B /* LogHelper.hpp in Headers */, 92FF04C123AA0BFB00AC97F6 /* Backend.hpp in Headers */, 482BFBCD28351BA1009210E4 /* ShaderMap.hpp in Headers */, 489D7A812550FDC900AD896A /* MetalPooling.hpp in Headers */, + CECF8C7F299CAD9400D3875B /* md5.h in Headers */, 92FF02A623AA0B5A00AC97F6 /* CPUQuantizedMaxPool.hpp in Headers */, 92FF028023AA0B5A00AC97F6 /* CPUFloatToInt8.hpp in Headers */, 92FF028723AA0B5A00AC97F6 /* CPUFixedPoint.hpp in Headers */, C43C8227251894F400A0FF84 /* Vec.hpp in Headers */, 4819FB1D24C138DF0050BD09 /* GeometryConvUtils.hpp in Headers */, 489D7A952550FDC900AD896A /* MetalMatMul.hpp in Headers */, + CECF8C83299CAD9400D3875B /* log_define.h in Headers */, C48CAE2628900C4A00271A6D /* ConvInt8Winograd.hpp in Headers */, 48F34730273A7C7300C45394 /* CPUImageProcess.hpp in Headers */, 489D7A702550FDC800AD896A /* MetalRaster.hpp in Headers */, @@ -3281,6 +3301,7 @@ 489D7A8A2550FDC900AD896A /* MetalConvolutionDepthwise.mm in Sources */, 48123003269EA83400EB7ABA /* ShapeUnique.cpp in Sources */, 92FF037D23AA0B5A00AC97F6 /* CPURelu.cpp in Sources */, + CECF8C5E299CACFD00D3875B /* WorkerThread.cpp in Sources */, 489D7A842550FDC900AD896A /* MetalBinary.mm in Sources */, 48747D6B245D9E33000B9709 /* GeometryFill.cpp in Sources */, 4819FB1F24C138DF0050BD09 /* GeometryConvUtils.cpp in Sources */, @@ -3380,6 +3401,7 @@ 48F34734273A7C8400C45394 /* ImageProcessFunction.cpp in Sources */, 6A131E4025823349002EC3D6 /* PluginKernel.cpp in Sources */, 48958781268EBA6F00EA01A7 /* CPUSegmentMean.cpp in Sources */, + CECF8C7B299CAD9400D3875B /* sha1.c in Sources */, 4D9A937026255BDA00F9B43C /* CoreMLUnary.cpp in Sources */, 92FF04A823AA0BFB00AC97F6 /* AutoTime.cpp in Sources */, 92FF04AE23AA0BFB00AC97F6 /* Backend.cpp in Sources */, @@ -3434,6 +3456,7 @@ 92FF03CE23AA0B5A00AC97F6 /* CPUOPRegister.cpp in Sources */, 92FF02B323AA0B5A00AC97F6 /* CPUInstanceNorm.cpp in Sources */, 4819FB2C24C1396A0050BD09 /* GeometryPoolGrad.cpp in Sources */, + CECF8C7E299CAD9400D3875B /* log_builder.cpp in Sources */, 92FF042223AA0B7100AC97F6 /* ShapeConcat.cpp in Sources */, 4D6D7FD12656891400F80814 /* MNNPackedSparseMatMulEpx4.S in Sources */, 4D5662CC299B76ED0031C1A1 /* MNNMaxPoolInt8.S in Sources */, @@ -3500,6 +3523,7 @@ 92FF02DE23AA0B5A00AC97F6 /* MNNSamplerC4BilinearOpt.S in Sources */, 48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */, 92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */, + 95772DCF2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S in Sources */, 92FF02E623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */, 48747D64245D9E33000B9709 /* GeometryTile.cpp in Sources */, 92FF043723AA0B7100AC97F6 /* ShapeDetectionOutput.cpp in Sources */, @@ -3512,6 +3536,7 @@ 4D759B2C25FF89EE0037B0B6 /* GeometryShape.cpp in Sources */, 11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */, 48747D6E245D9E33000B9709 /* GeometrySlice.cpp in Sources */, + CECF8C7D299CAD9400D3875B /* md5.c in Sources */, 92FF041923AA0B7100AC97F6 /* ShapeQuantizedMaxPool.cpp in Sources */, 92FF038A23AA0B5A00AC97F6 /* CPURange.cpp in Sources */, CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */, @@ -3569,8 +3594,10 @@ 92FF042E23AA0B7100AC97F6 /* ShapeProposal.cpp in Sources */, 92FF025923AA0B5A00AC97F6 /* CPUPoolInt8.cpp in Sources */, 92FF045B23AA0B7100AC97F6 /* ShapeShape.cpp in Sources */, + CECF8C87299CAD9400D3875B /* sds.c in Sources */, 9560EAD62BDE426A00C8D0B6 /* GeometryLayernorm.cpp in Sources */, 4D6D7FD72656896D00F80814 /* SparseConvolutionTiledExecutor.cpp in Sources */, + CECF8C82299CAD9400D3875B /* log_api.cpp in Sources */, 92FF03A823AA0B5A00AC97F6 /* WinogradOptFunction.cpp in Sources */, 950B28E229F627E00002F454 /* MNNBinarySubInt8.S in Sources */, 950B28F029F627F70002F454 /* MNNBinarySubInt8.S in Sources */, @@ -3580,6 +3607,7 @@ 4D9A936026255BDA00F9B43C /* Model.pb-c.c in Sources */, CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */, C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */, + CECF8C64299CAD8400D3875B /* LogHelper.mm in Sources */, 48FA474523AA127B00172C3B /* Executor.cpp in Sources */, 92FF02EA23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */, 48A8A61A21D101DE00C2B9A7 /* Matrix_CV.cpp in Sources */, @@ -3605,6 +3633,7 @@ 92FF027F23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.cpp in Sources */, EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */, 92FF04A423AA0BFB00AC97F6 /* Interpreter.cpp in Sources */, + CECF8C5C299CACFD00D3875B /* Log.cpp in Sources */, 92FF045623AA0B7100AC97F6 /* ShapeReshape.cpp in Sources */, 92FF032523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */, 92FF044423AA0B7100AC97F6 /* ShapeLSTM.cpp in Sources */, @@ -3641,6 +3670,7 @@ 92FF02B623AA0B5A00AC97F6 /* CPUUnary.cpp in Sources */, 92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */, CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */, + CECF8C78299CAD9400D3875B /* log_util_imp.cpp in Sources */, 92FF02CA23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */, 952298B22B4D39050043978B /* MetalLoop.mm in Sources */, 48925F372744AC2A00919B37 /* ShapeROIAlign.cpp in Sources */, @@ -3666,11 +3696,13 @@ 92FF02FF23AA0B5A00AC97F6 /* MNNFloat2Int8.S in Sources */, 4D9A937926255BDA00F9B43C /* CoreMLRaster.cpp in Sources */, 48417FF224D13BF50056D9A7 /* GeometrySelect.cpp in Sources */, + CECF8C84299CAD9400D3875B /* lz4.c in Sources */, 489D7A7E2550FDC900AD896A /* MNNMetalContext.mm in Sources */, 92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */, 92FF036B23AA0B5A00AC97F6 /* CPUResize.cpp in Sources */, 92FF02C723AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */, 92FF030923AA0B5A00AC97F6 /* MNNNV21ToBGRUnit.S in Sources */, + CECF8C79299CAD9400D3875B /* hmac-sha.cpp in Sources */, 92FF032623AA0B5A00AC97F6 /* MNNWinogradMatrixProductLeft.S in Sources */, 92FF04C023AA0BFB00AC97F6 /* Tensor.cpp in Sources */, CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */, @@ -3720,6 +3752,7 @@ 92FF043623AA0B7100AC97F6 /* ShapeSelect.cpp in Sources */, 92FF042B23AA0B7100AC97F6 /* ShapeOneHot.cpp in Sources */, 92FF043C23AA0B7100AC97F6 /* ShapeExpandDims.cpp in Sources */, + 95772DD02C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S in Sources */, 92FF045723AA0B7100AC97F6 /* ShapeTranspose.cpp in Sources */, 92FF031023AA0B5A00AC97F6 /* MNNMinFloat.S in Sources */, 92FF032A23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */, @@ -4068,7 +4101,7 @@ CODE_SIGN_STYLE = Automatic; DEAD_CODE_STRIPPING = YES; DEFINES_MODULE = YES; - DEVELOPMENT_TEAM = 6G7464HHUS; + DEVELOPMENT_TEAM = Q48UX93J22; DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; @@ -4155,7 +4188,7 @@ ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage; CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = 6G7464HHUS; + DEVELOPMENT_TEAM = Q48UX93J22; GCC_ENABLE_CPP_EXCEPTIONS = NO; GCC_ENABLE_CPP_RTTI = NO; HEADER_SEARCH_PATHS = ( @@ -4170,7 +4203,7 @@ IPHONEOS_DEPLOYMENT_TARGET = 9.0; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks"; OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)"; - PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vj; + PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcdedddddd; PRODUCT_NAME = "$(TARGET_NAME)"; TARGETED_DEVICE_FAMILY = "1,2"; }; @@ -4202,7 +4235,7 @@ MARKETING_VERSION = 1.0; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; - PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.ddddddddd; + PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vj; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_EMIT_LOC_STRINGS = YES; TARGETED_DEVICE_FAMILY = "1,2"; @@ -4234,7 +4267,7 @@ LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks"; MARKETING_VERSION = 1.0; MTL_FAST_MATH = YES; - PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.ddddddddd; + PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vj; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_EMIT_LOC_STRINGS = YES; TARGETED_DEVICE_FAMILY = "1,2"; @@ -4284,3 +4317,4 @@ }; rootObject = 0F1465AE1FA18D1000F9860A /* Project object */; } + diff --git a/project/ios/Playground/AppDelegate.mm b/project/ios/Playground/AppDelegate.mm index d073b12a8..7efc31eae 100644 --- a/project/ios/Playground/AppDelegate.mm +++ b/project/ios/Playground/AppDelegate.mm @@ -11,8 +11,10 @@ #include #include #import +#define MNN_OPEN_TIME_TRACE +#include #import "benchmark.h" -#define TEST_WORKMODE 0 +#define TEST_WORKMODE 2 @implementation AppDelegate - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions { @@ -44,11 +46,55 @@ - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:( auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle); CFRelease(url); auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8); - auto res = std::string(cstring) + "/models/mobilenet_v2_auth.mnn"; + auto res = std::string(cstring) + "/model/MobileNet/v1/mobilenet_v1.caffe.mnn"; + CFRelease(string); MNN::Interpreter* interpreter = MNN::Interpreter::createFromFile(res.c_str()); + interpreter->setSessionHint(MNN::Interpreter::GEOMETRY_COMPUTE_MASK, 0); MNN::ScheduleConfig config; - interpreter->createSession(config); + config.type = MNN_FORWARD_NN; + config.numThread = 1; + MNN::BackendConfig bnC; + bnC.precision = MNN::BackendConfig::Precision_Normal; + config.backendConfig = &bnC; + auto session = interpreter->createSession(config); + auto inpDev = interpreter->getSessionInput(session, nullptr); + auto outDev = interpreter->getSessionOutput(session, nullptr); + auto input = std::shared_ptr(new MNN::Tensor(inpDev)); + auto output = std::shared_ptr(new MNN::Tensor(outDev)); + auto inputHost = input->host(); + int inputSize = input->elementSize(); + for (int v=0; vhost(); + int outputSize = output->elementSize(); + + for (int i=0; i<2; ++i) { + inpDev->copyFromHostTensor(input.get()); + interpreter->runSession(session); + outDev->copyToHostTensor(output.get()); + float sum = 0.0f; + float maxv = 0.0f; + float minv = 0.0f; + for (int v=0; vcopyFromHostTensor(input.get()); + interpreter->runSession(session); + outDev->copyToHostTensor(output.get()); + } + } + delete interpreter; #endif return YES; } diff --git a/pymnn/examples/MNNTrain/mobilenet_finetune/mobilenet_transfer.py b/pymnn/examples/MNNTrain/mobilenet_finetune/mobilenet_transfer.py index 79c3bf9fd..a4017f913 100644 --- a/pymnn/examples/MNNTrain/mobilenet_finetune/mobilenet_transfer.py +++ b/pymnn/examples/MNNTrain/mobilenet_finetune/mobilenet_transfer.py @@ -14,7 +14,6 @@ def load_feature_extractor(model_file): output_var = var_dict['MobilenetV2/Logits/AvgPool'] # 'False' means the parameters int this module will not update during training feature_extractor = nn.load_module([input_var], [output_var], False) - feature_extractor = nn.FixModule(feature_extractor) # fix feature extractor return feature_extractor diff --git a/pymnn/examples/MNNTrain/quantization_aware_training/imagenet_dataset.py b/pymnn/examples/MNNTrain/quantization_aware_training/imagenet_dataset.py deleted file mode 100644 index a2f4cb0a3..000000000 --- a/pymnn/examples/MNNTrain/quantization_aware_training/imagenet_dataset.py +++ /dev/null @@ -1,97 +0,0 @@ -import numpy as np -from PIL import Image -import MNN -F = MNN.expr - - -# adapted from pycaffe -def load_image(filename, color=True): - """ - Load an image converting from grayscale or alpha as needed. - - Parameters - ---------- - filename : string - color : boolean - flag for color format. True (default) loads as RGB while False - loads as intensity (if image is already grayscale). - - Returns - ------- - image : an image with type np.float32 in range [0, 1] - of size (H x W x 3) in RGB or - of size (H x W x 1) in grayscale. - """ - img = Image.open(filename) - img = np.array(img) - if img.ndim == 2: - img = img[:, :, np.newaxis] - if color: - img = np.tile(img, (1, 1, 3)) - elif img.shape[2] == 4: - img = img[:, :, :3] - return img - - -def center_crop(image_data, crop_factor): - height, width, channels = image_data.shape - - h_size = int(height * crop_factor) - h_start = int((height - h_size) / 2) - h_end = h_start + h_size - - w_size = int(width * crop_factor) - w_start = int((width - w_size) / 2) - w_end = w_start + w_size - - cropped_image = image_data[h_start:h_end, w_start:w_end, :] - - return cropped_image - - -def resize_image(image, shape): - im = Image.fromarray(image) - im = im.resize(shape) - resized_image = np.array(im) - - return resized_image - - -class ImagenetDataset(MNN.data.Dataset): - def __init__(self, image_folder, val_txt, training_dataset=True): - super(ImagenetDataset, self).__init__() - self.is_training_dataset = training_dataset - - self.image_folder = image_folder - - if self.is_training_dataset: - f = open(val_txt) - self.image_list = f.readlines()[0:10000] - f.close() - else: - f = open(val_txt) - self.image_list = f.readlines()[10000:50000] - f.close() - - def __getitem__(self, index): - image_name = self.image_folder + self.image_list[index].split(' ')[0] - image_label = int(self.image_list[index].split(' ')[1]) + 1 # align with tf mobilenet labels, we need add 1 - - image_data = load_image(image_name) - image_data = center_crop(image_data, 0.85) - image_data = resize_image(image_data, (224, 224)) - - image_data = (image_data - 127.5) / 127.5 - - dv = F.const(image_data.flatten().tolist(), [224, 224, 3], F.data_format.NHWC) - dl = F.const([image_label], [], F.data_format.NHWC, F.dtype.int) - # first for inputs, and may have many inputs, so it's a list - # second for targets, also, there may be more than one targets - return [dv], [dl] - - def __len__(self): - # size of the dataset - if self.is_training_dataset: - return 10000 - else: - return 40000 diff --git a/pymnn/examples/MNNTrain/quantization_aware_training/quant_aware_training.py b/pymnn/examples/MNNTrain/quantization_aware_training/quant_aware_training.py deleted file mode 100644 index ac6e32f05..000000000 --- a/pymnn/examples/MNNTrain/quantization_aware_training/quant_aware_training.py +++ /dev/null @@ -1,125 +0,0 @@ -from __future__ import print_function -import time -import argparse -import numpy as np -import MNN -from imagenet_dataset import ImagenetDataset -nn = MNN.nn -F = MNN.expr - - -def test_func(net, test_dataloader): - net.train(False) - test_dataloader.reset() - - correct = 0 - total = 0 - for i in range(test_dataloader.iter_number): - example = test_dataloader.next() - input_data = example[0] - output_target = example[1] - data = input_data[0] # which input, model may have more than one inputs - label = output_target[0] # also, model may have more than one outputs - - predict = net(data) - predict = F.argmax(predict, 1) - predict = np.array(predict.read()) - label = np.array(label.read()) - correct += (np.sum(label == predict)) - total += label.size - - if (i+1) % 10 == 0: - print("test iteration", i+1, ", accuracy: ", correct / total * 100, "%") - - print("test acc: ", correct * 100.0 / test_dataloader.size, "%") - - -def train_func(net, train_dataloader, opt, num_classes): - net.train(True) - train_dataloader.reset() - - t0 = time.time() - # for i in range(train_dataloader.iter_number): - for i in range(100): # actually, in our full experiment, we only need 3K images using ILSVRC2012 training dataset - example = train_dataloader.next() - input_data = example[0] - output_target = example[1] - data = input_data[0] # which input, model may have more than one inputs - label = output_target[0] # also, model may have more than one outputs - - predict = net.forward(data) - target = F.one_hot(F.cast(label, F.int), num_classes, 1, 0) - loss = nn.loss.cross_entropy(predict, target) - opt.step(loss) - - if i % 10 == 0: - print("train loss: ", loss.read()) - - t1 = time.time() - cost = t1 - t0 - print("Epoch cost: %.3f s." % cost) - F.save(net.parameters, "temp.mobilenet.snapshot") - - -def demo(): - ''' - demo for quant-aware-training using tf mobilenet v2. - the dataset used is the ILSVRC2012 validation dataset which has 50000 images - 10000 for training (actually we only need 3K in our standard experiment using ILSVRC2012 training dataset) - 40000 for testing - ''' - parser = argparse.ArgumentParser() - parser.add_argument("--model_file", type=str, required=True,\ - help="mobilenet MNN model file") - parser.add_argument("--val_image_path", type=str, required=True,\ - help="path to ILSVRC2012 val images") - parser.add_argument("--val_txt", type=str, required=True,\ - help="path to ILSVRC2012 val.txt") - - args = parser.parse_args() - - model_file = args.model_file - image_path = args.val_image_path - val_txt = args.val_txt - - train_dataset = ImagenetDataset(image_path, val_txt, True) - test_dataset = ImagenetDataset(image_path, val_txt, False) - train_dataloader = MNN.data.DataLoader(train_dataset, batch_size=32, shuffle=True) - test_dataloader = MNN.data.DataLoader(test_dataset, batch_size=10, shuffle=False) - - m = F.load_as_dict(model_file) - - inputs_outputs = F.get_inputs_and_outputs(m) - for key in inputs_outputs[0].keys(): - print('input names:\t', key) - for key in inputs_outputs[1].keys(): - print('output names:\t', key) - - # get inputs and outputs - inputs = [m['input']] - outputs = [m['MobilenetV2/Predictions/Reshape_1']] - - net = nn.load_module(inputs, outputs, True) - - # turn net to quant-aware-training module - nn.compress.train_quant(net, quant_bits=8) - - opt = MNN.optim.SGD(net, 1e-5, 0.9, 0.00004) - - num_classes = 1001 - - for epoch in range(5): - train_func(net, train_dataloader, opt, num_classes) - - # save model - file_name = '%d.mobilenet.mnn' % epoch - net.train(False) - predict = net.forward(F.placeholder([1, 3, 224, 224], F.NC4HW4)) - print("Save to " + file_name) - F.save([predict], file_name) - - test_func(net, test_dataloader) - - -if __name__ == "__main__": - demo() diff --git a/pymnn/pip_package/build_deps.py b/pymnn/pip_package/build_deps.py index 8d0297c90..320975bf5 100644 --- a/pymnn/pip_package/build_deps.py +++ b/pymnn/pip_package/build_deps.py @@ -118,7 +118,7 @@ def build_deps(): os.system('cmake ' + extra_opts + '-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release \ -DMNN_BUILD_SHARED_LIBS=ON -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF\ -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \ - .. && make MNN MNNConvertDeps -j4') + .. && make MNN MNNConvertDeps -j64') ################################################################################ # Building dependent libraries ################################################################################ diff --git a/pymnn/pip_package/pyproject.toml b/pymnn/pip_package/pyproject.toml index 57cc5503b..c178a4ebc 100644 --- a/pymnn/pip_package/pyproject.toml +++ b/pymnn/pip_package/pyproject.toml @@ -11,12 +11,11 @@ build-backend = "setuptools.build_meta" [tool.cibuildwheel] test-skip = [ - "*", "cp36-*", "*-macosx_arm64" ] test-requires = [ - "opencv-python", + "opencv-python==4.6.0.66", "numpy", "torch" ] @@ -42,8 +41,6 @@ repair-wheel-command = "" build = "cp*-manylinux*" skip = "pp*" before-all = [ - "sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-*", - "sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://mirrors.aliyun.com|g' /etc/yum.repos.d/CentOS-*", "yum install -y wget", "pushd pymnn/pip_package", "python3 build_deps.py --torch", diff --git a/pymnn/test/model_test.py b/pymnn/test/model_test.py index d25f7f619..1da4e85f0 100644 --- a/pymnn/test/model_test.py +++ b/pymnn/test/model_test.py @@ -90,10 +90,16 @@ def createTensor(tensor, file=''): data = loadtxt(file, shape, dtype) return MNN.Tensor(shape, tensor.getDataType(), data, tensor.getDimensionType()) -def compareTensor(tensor, file, atol=5e-2): +def compareTensor(tensor, file, tolerance=5e-2): outputNumpyData = tensor.getNumpyData() expectNumpyData = loadtxt(file, tensor.getShape()) - return np.allclose(outputNumpyData, expectNumpyData, atol=atol) + max_abs_dif = np.abs(outputNumpyData - expectNumpyData).max() + max_exp_val = np.abs(expectNumpyData).max() + diff_rate = max_abs_dif / max_exp_val + if diff_rate > tolerance: + print(f'# Error: max_abs_dif: {max_abs_dif}, max_exp_val: {max_exp_val}, diff_rate: {diff_rate}') + return False + return True def log_result(success, model): global total_num @@ -240,3 +246,6 @@ def testPymnnConfig(model_root_dir): for wrong in wrongs: print(wrong) print('TEST_NAME_PYMNN_MODEL: Pymnn模型测试\nTEST_CASE_AMOUNT_PYMNN_MODEL: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(wrongs), total_num - len(wrongs))) + print('TEST_CASE={\"name\":\"Pymnn模型测试\",\"failed\":%d,\"passed\":%d}\n'%(len(wrongs), total_num - len(wrongs))) + if len(wrongs) > 0: + exit(1) diff --git a/pymnn/test/unit_test.py b/pymnn/test/unit_test.py index d9addf734..853c78961 100644 --- a/pymnn/test/unit_test.py +++ b/pymnn/test/unit_test.py @@ -45,6 +45,7 @@ def tearDownClass(cls): skipped = len(cls.skipped) try: print('\nTEST_NAME_PYMNN_UNIT: Pymnn单元测试\nTEST_CASE_AMOUNT_PYMNN_UNIT: {\"blocked\":%d,\"failed\":%d,\"passed\":%d,\"skipped\":%d}\n'%(blocked, failed, passed, skipped)) + print('\nTEST_CASE={\"name\":\"Pymnn单元测试\",\"failed\":%d,\"passed\":%d}\n'%(failed, passed)) except: print('\nTEST_NAME_PYMNN_UNIT: PymnnUnitTest\nTEST_CASE_AMOUNT_PYMNN_UNIT: {\"blocked\":%d,\"failed\":%d,\"passed\":%d,\"skipped\":%d}\n'%(blocked, failed, passed, skipped)) def run(self, result=None): diff --git a/pymnn/update_mnn_wrapper_assets.sh b/pymnn/update_mnn_wrapper_assets.sh index d0cbedc47..cb476cafb 100755 --- a/pymnn/update_mnn_wrapper_assets.sh +++ b/pymnn/update_mnn_wrapper_assets.sh @@ -43,7 +43,7 @@ find . -name __pycache__ | xargs rm -rf if cmdExist pyenv; then pyenv global $py_version fi -python -c "import compileall; compileall.compile_dir('/tmp/mnn_py/MNN', force=True)" +python2 -c "import compileall; compileall.compile_dir('/tmp/mnn_py/MNN', force=True)" find . -name "*.py" | xargs rm -rf cd .. zip -r MNN.zip MNN diff --git a/source/backend/arm82/Arm82Functions.cpp b/source/backend/arm82/Arm82Functions.cpp index 19038ec94..2e4e9dc6b 100644 --- a/source/backend/arm82/Arm82Functions.cpp +++ b/source/backend/arm82/Arm82Functions.cpp @@ -691,6 +691,11 @@ bool Arm82Functions::init() { #define FUNC_PTR_ASSIGN(dst, src) dst = (decltype(dst))(src) gInstance = new CoreFunctions; + FUNC_PTR_ASSIGN(gInstance->MNNFp32ToFp8, MNNFp32ToFp8); + FUNC_PTR_ASSIGN(gInstance->MNNFp16ToFp8, MNNFp16ToFp8); + FUNC_PTR_ASSIGN(gInstance->MNNFp8ToFp32, MNNFp8ToFp32); + FUNC_PTR_ASSIGN(gInstance->MNNFp8ToFp16, MNNFp8ToFp16); + FUNC_PTR_ASSIGN(gInstance->MNNFp32ToLowp, MNNQuantizeFP16); FUNC_PTR_ASSIGN(gInstance->MNNLowpToFp32, MNNDequantizeFP16); gInstance->bytes = 2; diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNCountMinMax_ARM82.S b/source/backend/arm82/asm/arm64/low_memory/MNNCountMinMax_ARM82.S index 680e6f2ac..7c32ca912 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNCountMinMax_ARM82.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNCountMinMax_ARM82.S @@ -116,7 +116,7 @@ stp d8, d9, [sp, #(16 * 3)] Start: ld1 {v31.8h}, [x0], #16 sub x3, x3, #1 -mov v30.8h, v31.8h // v30:min v31:max +mov v30.16b, v31.16b // mov v30.8h, v31.8h // v30:min v31:max TILE_24: diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S index 118b4f104..ae6dd794b 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S @@ -139,9 +139,9 @@ LoopH: sxtl v1.8h, v2.8b scvtf v0.8h, v0.8h scvtf v1.8h, v1.8h - mov v2.8h, v7.8h + mov v2.16b, v7.16b // mov v2.8h, v7.8h fmla v2.8h, v1.8h, v5.8h - mov v1.8h, v6.8h + mov v1.16b, v6.16b // mov v1.8h, v6.8h fmla v1.8h, v0.8h, v4.8h cbnz x19, LH8_BLOCK_GT_0 @@ -187,9 +187,9 @@ LoopH: sxtl v1.8h, v2.8b scvtf v0.8h, v0.8h scvtf v1.8h, v1.8h - mov v2.8h, v7.8h + mov v2.16b, v7.16b // mov v2.8h, v7.8h fmla v2.8h, v1.8h, v5.8h - mov v1.8h, v6.8h + mov v1.16b, v6.16b // mov v1.8h, v6.8h fmla v1.8h, v0.8h, v4.8h ld1 {v0.8h}, [x15], #16 @@ -254,7 +254,7 @@ LoopHRemain: sxtl v3.8h, v3.8b scvtf v6.8h, v3.8h - mov v3.8h, v21.8h + mov v3.16b, v21.16b // mov v3.8h, v21.8h fmla v3.8h, v6.8h, v20.8h ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 @@ -303,7 +303,7 @@ LoopHRemain: sxtl v3.8h, v3.8b scvtf v6.8h, v3.8h - mov v3.8h, v21.8h + mov v3.16b, v21.16b // mov v3.8h, v21.8h fmla v3.8h, v6.8h, v20.8h ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S index 8f92ac238..52ab11e13 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S @@ -132,9 +132,9 @@ LoopH: sxtl2 v1.8h, v2.16b scvtf v0.8h, v0.8h scvtf v1.8h, v1.8h - mov v2.8h, v7.8h + mov v2.16b, v7.16b // mov v2.8h, v7.8h fmla v2.8h, v1.8h, v5.8h - mov v1.8h, v6.8h + mov v1.16b, v6.16b // mov v1.8h, v6.8h fmla v1.8h, v0.8h, v4.8h cbnz x19, LH8_BLOCK_GT_0 @@ -174,9 +174,9 @@ LoopH: sxtl2 v1.8h, v2.16b scvtf v0.8h, v0.8h scvtf v1.8h, v1.8h - mov v2.8h, v7.8h + mov v2.16b, v7.16b // mov v2.8h, v7.8h fmla v2.8h, v1.8h, v5.8h - mov v1.8h, v6.8h + mov v1.16b, v6.16b // mov v1.8h, v6.8h fmla v1.8h, v0.8h, v4.8h ld1 {v0.8h}, [x15], #16 @@ -235,7 +235,7 @@ LoopHRemain: ld1 {v3.16b}, [x2], #16 sxtl v3.8h, v3.8b scvtf v6.8h, v3.8h - mov v3.8h, v21.8h + mov v3.16b, v21.16b // mov v3.8h, v21.8h fmla v3.8h, v6.8h, v20.8h ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 @@ -278,7 +278,7 @@ LoopHRemain: ld1 {v3.16b}, [x2], #16 sxtl v0.8h, v3.8b scvtf v6.8h, v0.8h - mov v3.8h, v21.8h + mov v3.16b, v21.16b // mov v3.8h, v21.8h fmla v3.8h, v6.8h, v20.8h ld1 {v0.4h, v1.4h, v2.4h}, [x15], #24 diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S index 3949f7414..f23d2902c 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S @@ -85,8 +85,8 @@ LoopE8: sxtl v2.8h, v4.8b scvtf v1.8h, v1.8h scvtf v2.8h, v2.8h - mov v3.8h, v14.8h - mov v4.8h, v15.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h + mov v4.16b, v15.16b // mov v4.8h, v15.8h fmla v3.8h, v1.8h, v12.8h fmla v4.8h, v2.8h, v13.8h @@ -158,8 +158,8 @@ LoopE8: scvtf v1.8h, v1.8h scvtf v2.8h, v2.8h - mov v3.8h, v14.8h - mov v4.8h, v15.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h + mov v4.16b, v15.16b // mov v4.8h, v15.8h fmla v3.8h, v1.8h, v12.8h fmla v4.8h, v2.8h, v13.8h @@ -276,7 +276,7 @@ LoopE8: sxtl v1.8h, v3.8b scvtf v1.8h, v1.8h - mov v3.8h, v14.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h ld1 {v0.8h}, [x15], x11 @@ -316,7 +316,7 @@ LoopE8: sxtl v1.8h, v3.8b scvtf v1.8h, v1.8h - mov v3.8h, v14.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h ld1 {v0.8h}, [x15], x11 @@ -410,8 +410,8 @@ blt E1 sxtl v2.8h, v4.8b scvtf v1.8h, v1.8h scvtf v2.8h, v2.8h - mov v3.8h, v14.8h - mov v4.8h, v15.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h + mov v4.16b, v15.16b // mov v4.8h, v15.8h fmla v3.8h, v1.8h, v12.8h fmla v4.8h, v2.8h, v13.8h @@ -459,8 +459,8 @@ blt E1 scvtf v1.8h, v1.8h scvtf v2.8h, v2.8h - mov v3.8h, v14.8h - mov v4.8h, v15.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h + mov v4.16b, v15.16b // mov v4.8h, v15.8h fmla v3.8h, v1.8h, v12.8h fmla v4.8h, v2.8h, v13.8h @@ -541,7 +541,7 @@ blt E1 sxtl v1.8h, v3.8b scvtf v1.8h, v1.8h - mov v3.8h, v14.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h ld1 {v0.4h}, [x15], x11 @@ -570,7 +570,7 @@ blt E1 sxtl v1.8h, v3.8b scvtf v1.8h, v1.8h - mov v3.8h, v14.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h ld1 {v0.4h}, [x15], x11 @@ -647,8 +647,8 @@ LoopE1: sxtl v2.8h, v4.8b scvtf v1.8h, v1.8h scvtf v2.8h, v2.8h - mov v3.8h, v14.8h - mov v4.8h, v15.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h + mov v4.16b, v15.16b // mov v4.8h, v15.8h fmla v3.8h, v1.8h, v12.8h fmla v4.8h, v2.8h, v13.8h @@ -680,8 +680,8 @@ LoopE1: sxtl v2.8h, v4.8b scvtf v1.8h, v1.8h scvtf v2.8h, v2.8h - mov v3.8h, v14.8h - mov v4.8h, v15.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h + mov v4.16b, v15.16b // mov v4.8h, v15.8h fmla v3.8h, v1.8h, v12.8h fmla v4.8h, v2.8h, v13.8h @@ -734,7 +734,7 @@ LoopE1: sxtl v1.8h, v3.8b scvtf v1.8h, v1.8h - mov v3.8h, v14.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h ld1 {v0.h}[0], [x15], x11 cbnz x26, LE1H4_BLOCK_GT_0 @@ -756,7 +756,7 @@ LoopE1: sxtl v1.8h, v3.8b scvtf v1.8h, v1.8h - mov v3.8h, v14.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h ld1 {v0.h}[0], [x15], x11 fmla v16.8h, v3.8h, v0.h[0] diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S index f73046ec0..28d34b174 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S @@ -75,8 +75,8 @@ LoopE8: sxtl2 v2.8h, v10.16b scvtf v1.8h, v1.8h scvtf v2.8h, v2.8h - mov v3.8h, v14.8h - mov v4.8h, v15.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h + mov v4.16b, v15.16b // mov v4.8h, v15.8h fmla v3.8h, v1.8h, v12.8h fmla v4.8h, v2.8h, v13.8h @@ -139,8 +139,8 @@ LoopE8: sxtl2 v2.8h, v10.16b scvtf v1.8h, v1.8h scvtf v2.8h, v2.8h - mov v3.8h, v14.8h - mov v4.8h, v15.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h + mov v4.16b, v15.16b // mov v4.8h, v15.8h fmla v3.8h, v1.8h, v12.8h fmla v4.8h, v2.8h, v13.8h @@ -253,7 +253,7 @@ LoopE8: ld1 {v10.16b}, [x13], #16 sxtl v1.8h, v10.8b scvtf v1.8h, v1.8h - mov v3.8h, v14.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h ld1 {v0.8h}, [x15], x11 @@ -289,7 +289,7 @@ LoopE8: ld1 {v10.16b}, [x13], #16 sxtl v1.8h, v10.8b scvtf v1.8h, v1.8h - mov v3.8h, v14.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h ld1 {v0.8h}, [x15], x11 @@ -377,8 +377,8 @@ blt E1 sxtl2 v2.8h, v10.16b scvtf v1.8h, v1.8h scvtf v2.8h, v2.8h - mov v3.8h, v14.8h - mov v4.8h, v15.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h + mov v4.16b, v15.16b // mov v4.8h, v15.8h fmla v3.8h, v1.8h, v12.8h fmla v4.8h, v2.8h, v13.8h @@ -419,8 +419,8 @@ blt E1 sxtl2 v2.8h, v10.16b scvtf v1.8h, v1.8h scvtf v2.8h, v2.8h - mov v3.8h, v14.8h - mov v4.8h, v15.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h + mov v4.16b, v15.16b // mov v4.8h, v15.8h fmla v3.8h, v1.8h, v12.8h fmla v4.8h, v2.8h, v13.8h @@ -497,7 +497,7 @@ blt E1 ld1 {v10.16b}, [x13], #16 sxtl v1.8h, v10.8b scvtf v1.8h, v1.8h - mov v3.8h, v14.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h ld1 {v0.4h}, [x15], x11 @@ -522,7 +522,7 @@ blt E1 ld1 {v10.16b}, [x13], #16 sxtl v1.8h, v10.8b scvtf v1.8h, v1.8h - mov v3.8h, v14.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h ld1 {v0.4h}, [x15], x11 @@ -593,8 +593,8 @@ LoopE1: sxtl2 v2.8h, v10.16b scvtf v1.8h, v1.8h scvtf v2.8h, v2.8h - mov v3.8h, v14.8h - mov v4.8h, v15.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h + mov v4.16b, v15.16b // mov v4.8h, v15.8h fmla v3.8h, v1.8h, v12.8h fmla v4.8h, v2.8h, v13.8h @@ -620,8 +620,8 @@ LoopE1: sxtl2 v2.8h, v10.16b scvtf v1.8h, v1.8h scvtf v2.8h, v2.8h - mov v3.8h, v14.8h - mov v4.8h, v15.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h + mov v4.16b, v15.16b // mov v4.8h, v15.8h fmla v3.8h, v1.8h, v12.8h fmla v4.8h, v2.8h, v13.8h @@ -669,7 +669,7 @@ LoopE1: ld1 {v10.16b}, [x13], #16 sxtl v1.8h, v10.8b scvtf v1.8h, v1.8h - mov v3.8h, v14.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h ld1 {v0.h}[0], [x15], x11 @@ -687,7 +687,7 @@ LoopE1: ld1 {v10.16b}, [x13], #16 sxtl v1.8h, v10.8b scvtf v1.8h, v1.8h - mov v3.8h, v14.8h + mov v3.16b, v14.16b // mov v3.8h, v14.8h fmla v3.8h, v1.8h, v12.8h ld1 {v0.h}[0], [x15], x11 fmla v16.8h, v3.8h, v0.h[0] diff --git a/source/backend/coreml/backend/CoreMLExecutor.mm b/source/backend/coreml/backend/CoreMLExecutor.mm index 2e9d99aac..7b664a0cd 100644 --- a/source/backend/coreml/backend/CoreMLExecutor.mm +++ b/source/backend/coreml/backend/CoreMLExecutor.mm @@ -106,31 +106,20 @@ - (MLFeatureValue*)featureValueForName:(NSString*)featureName { for (auto& input : *_inputs) { if ([featureName cStringUsingEncoding:NSUTF8StringEncoding] == input.second) { auto input_shape = input.first->shape(); - NSArray* shape = @[ - @(input_shape[0]), - @(input_shape[1]), - @(input_shape[2]), - ]; - NSArray* strides = @[ - @(input_shape[1] * input_shape[2]), - @(input_shape[2]), - @1, - ]; - - if ([self coreMlVersion] >= 3) { - shape = @[ - @(input_shape[0]), - @(input_shape[1]), - @(input_shape[2]), - @(input_shape[3]), - ]; - strides = @[ - @(input_shape[1] * input_shape[2] * input_shape[3]), - @(input_shape[2] * input_shape[3]), - @(input_shape[3]), - @1, - ]; - }; + NSMutableArray* shape = [NSMutableArray arrayWithCapacity:input_shape.size()]; + NSMutableArray* strides = [NSMutableArray arrayWithCapacity:input_shape.size()]; + std::vector stridesDim(input_shape.size()); + int curStride = 1; + if (input_shape.size() >= 1) { + for (int i=input_shape.size()-1; i>=0; --i) { + stridesDim[i] = curStride; + curStride *= input_shape[i]; + } + } + for (int i=0; igetType() == halide_type_of()) { CVPixelBufferRef pixelBuffer = NULL; @@ -210,6 +199,7 @@ - (bool)invokeWithInputs:(const std::vector(output.first)->buffer().host = (unsigned char*)data.dataPointer; } } + inputFeature = nil; } return YES; } diff --git a/source/backend/coreml/execution/CoreMLConvolution.cpp b/source/backend/coreml/execution/CoreMLConvolution.cpp index 2d335af36..7e1a22fb6 100644 --- a/source/backend/coreml/execution/CoreMLConvolution.cpp +++ b/source/backend/coreml/execution/CoreMLConvolution.cpp @@ -29,7 +29,7 @@ void CoreMLConvolution::loadWeightBias(const std::vector &inputs) { } auto conv2D = mOp->main_as_Convolution2D(); if (nullptr != conv2D->quanParameter()) { - quanCommon = ConvolutionCommon::load(conv2D, backend(), true); + quanCommon = ConvolutionCommon::load(mOp, backend(), true); if (nullptr == quanCommon) { MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", mOp->name()->c_str()); } @@ -84,7 +84,7 @@ void CoreMLConvolution::addPadLayer(const Tensor * input, const Convolution2DCom bottom = (pad_out_height - inputHeight) - top; left = (pad_out_width - inputWidth) / 2; right = (pad_out_width - inputWidth) - left; - + if (top < 0 || bottom < 0 || left < 0 || right < 0) { isSamePadding = true; pad_out_width = outputWidth / sx; diff --git a/source/backend/cpu/CPUAttention.cpp b/source/backend/cpu/CPUAttention.cpp index a420f2d0d..8a5a89ec3 100644 --- a/source/backend/cpu/CPUAttention.cpp +++ b/source/backend/cpu/CPUAttention.cpp @@ -27,87 +27,8 @@ // reduce the value of 'query' to 'query * FP16_QSCALE', avoid fp16 overflow #define FP16_QSCALE 0.5 -#define FP8_E5M2 - namespace MNN { -#if defined FP8_E5M2 // E5M2 : [S E E E E E M M] -typedef uint8_t fp8_t; -static inline fp8_t fp16_to_fp8(FLOAT16_T x) { - return *((fp8_t *)(&x) + 1); -} -static FLOAT16_T fp8_to_fp16(fp8_t x) { - uint16_t rawData = 0; - rawData |= (uint16_t)x << 8; - return *((FLOAT16_T *)(&rawData)); -} -static inline fp8_t float_to_fp8(float x) { - uint32_t rawData = *((uint32_t *)(&x)); - int sign = (rawData >> 31) & 1U; - int exp = (int)((rawData >> 23) & 0x0ffU) - 127; - if (exp < -16) - exp = -16; - if (exp > 15) - exp = 15; - exp += 16; // exp [-16, 15] ==> [0, 31] - int mant = (rawData >> 21) & 3U; - return (sign << 7) | (exp << 2) | mant; -} -static inline float fp8_to_float(fp8_t x) { - uint32_t sign = (x >> 7) & 1U; - uint32_t exp = (int)((x >> 2) & 0x1fU) - 16 + 127; - uint32_t mant = (x & 3U) << 21; - uint32_t rawData = (sign << 31) | (exp << 23) | mant; - return *((float *)(&rawData)); -} -#elif defined FP8_E4M3 // E4M3: [S E E E E M M M] -typedef uint8_t fp8_t; -static inline fp8_t fp16_to_fp8(FLOAT16_T x) { - uint16_t rawData = *((uint16_t *)(&x)); - int sign = (rawData >> 15) & 1U; - int exp = (int)((rawData >> 10) & 0x1fU) - 15; - if (exp < -8) - exp = -8; - if (exp > 7) - exp = 7; - exp += 8; // exp [-8, 7] ==> [0, 15] - int mant = (rawData >> 7) & 7U; - return (sign << 7) | (exp << 3) | mant; -} -static FLOAT16_T fp8_to_fp16(fp8_t x) { - uint32_t sign = (x >> 7) & 1U; - uint32_t exp = (int)((x >> 3) & 0x0fU) - 8 + 15; - uint32_t mant = (x & 7U) << 7; - uint16_t rawData = (sign << 15) | (exp << 10) | mant; - return *((FLOAT16_T *)(&rawData)); -} -static inline fp8_t float_to_fp8(float x) { - uint32_t rawData = *((uint32_t *)(&x)); - int sign = (rawData >> 31) & 1U; - int exp = (int)((rawData >> 23) & 0x0ffU) - 127; - if (exp < -8) - exp = -8; - if (exp > 7) - exp = 7; - exp += 8; // exp [-8, 7] ==> [0, 15] - int mant = (rawData >> 20) & 7U; - return (sign << 7) | (exp << 3) | mant; -} -static inline float fp8_to_float(fp8_t x) { - uint32_t sign = (x >> 7) & 1U; - uint32_t exp = (int)((x >> 3) & 0x0fU) - 8 + 127; - uint32_t mant = (x & 7U) << 20; - uint32_t rawData = (sign << 31) | (exp<< 23) | mant; - return *((float *)(&rawData)); -} -#else -// Do not support fp8 -#endif // fp8 format definition - -static int nearestInt(float x) { - return x < 0 ? -nearestInt(-x) : (int)(x + 0.5f); -} - template static void pack_query(Tensor* query, char* pack_q, int mNumHead, int mHeadDim, int eP, int seq_len, int h, float q_scale) { T * query_src = query->host(); @@ -121,99 +42,6 @@ static void pack_query(Tensor* query, char* pack_q, int mNumHead, int mHeadDim, } } -template -static void pack_key(Tensor* key, char* pack_key, int mPastLength, int seq_len, int mKvNumHead, int mHeadDim, int hP, int kv_h, char* scale, char* zero_point, bool quant) { - if (quant) { // Quantize the keys - auto key_src = key->host(); - auto key_dst = reinterpret_cast(pack_key); - auto scale_dst = reinterpret_cast(scale); - auto zeroPoint_dst = reinterpret_cast(zero_point); - for (int i = 0; i < seq_len; i++) { - float minKey = key_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + 0]; - float maxKey = key_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + 0]; - for (int j = 1; j < mHeadDim; j++) { - auto key = key_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + j]; - minKey = ALIMIN(minKey, key); - maxKey = ALIMAX(maxKey, key); - } - int out_index = (mPastLength + i) / hP; - int in_index = (mPastLength + i) % hP; - scale_dst[out_index * hP + in_index] = (maxKey - minKey) / 255.0f; - zeroPoint_dst[out_index * hP + in_index] = 128.0f * (maxKey - minKey) / 255.0f + minKey; - for (int j = 0; j < mHeadDim; j++) { - key_dst[out_index * mHeadDim * hP + j * hP + in_index] = nearestInt((key_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + j] - minKey) / (maxKey - minKey) * 255 - 128); - } - } - } - else { // Do not quantize the keys - auto key_src = key->host(); - auto key_dst = reinterpret_cast(pack_key); - for (int i = 0; i < seq_len; i++) { - int out_index = (mPastLength + i) / hP; - int in_index = (mPastLength + i) % hP; - for (int j = 0; j < mHeadDim; j++) { - key_dst[out_index * mHeadDim * hP + j * hP + in_index] = key_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + j]; - } - } - } -} - - - -template -static void pack_value(Tensor* value, char* pack_value, int mMaxLength, int mPastLength, int seq_len, int mKvNumHead, int mHeadDim, int hP, int kv_h, bool quant) { - if (quant) { // Quantize the values to fp8 - T * value_src = value->host(); - fp8_t * value_dst = reinterpret_cast(pack_value); - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < mHeadDim; j++) { - int out_index = j / hP; - int in_index = j % hP; - auto origin = value_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + j]; - if (sizeof(T) == 2) - value_dst[out_index * mMaxLength * hP + (mPastLength + i) * hP + in_index] = fp16_to_fp8(origin); - else - value_dst[out_index * mMaxLength * hP + (mPastLength + i) * hP + in_index] = float_to_fp8(origin); - } - } - } - else { // Do not quantize the values - T * value_src = value->host(); - T * value_dst = reinterpret_cast(pack_value); - for (int i = 0; i < seq_len; i++) { - for (int j = 0; j < mHeadDim; j++) { - int out_index = j / hP; - int in_index = j % hP; - value_dst[out_index * mMaxLength * hP + (mPastLength + i) * hP + in_index] = value_src[i * mKvNumHead * mHeadDim + kv_h * mHeadDim + j]; - } - } - } -} - -void dequant_value_float(char * dst, char * src, int mHeadDim, int kv_seq_len, int hP, int mMaxLength) { - fp8_t * qv = (fp8_t *)src; - float * dqv = (float *)dst; - for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { - for (int j = 0; j < kv_seq_len; j++) { - for (int k = 0; k < hP; k++) { - dqv[i * kv_seq_len * hP + j * hP + k] = fp8_to_float(qv[i * mMaxLength * hP + j * hP + k]); - } - } - } -} - -void dequant_value_fp16(char * dst, char * src, int mHeadDim, int kv_seq_len, int hP, int mMaxLength) { - fp8_t * qv = (fp8_t *)src; - FLOAT16_T * dqv = (FLOAT16_T *)dst; - for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { - for (int j = 0; j < kv_seq_len; j++) { - for (int k = 0; k < hP; k++) { - dqv[i * kv_seq_len * hP + j * hP + k] = fp8_to_fp16(qv[i * mMaxLength * hP + j * hP + k]); - } - } - } -} - template static void unpack_QK(float * unpack_qk_dst, char * pack_qk_src, int seq_len, int kv_seq_len, int unit) { float * dst = unpack_qk_dst; @@ -285,95 +113,6 @@ static void unpack_QKV(char* pack_qkv, char* unpack_qkv, int mNumHead, int mHead } } -void CPUAttention::allocKVCache(int kv_seq_len, bool quantKey, bool quantValue) { - if (!mKVCache) { - return; - } - mResource->mMaxLength = kv_seq_len + mResource->mExpandChunk; - if (quantKey) { - mResource->mPastKey.reset(Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), mResource->mHeadDim, hP})); - mResource->mDequantKeyScale.reset(Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), 1, hP})); - mResource->mDequantKeyZeroPoint.reset(Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), 1, hP})); - backend()->onAcquireBuffer(mResource->mPastKey.get(), Backend::STATIC); - backend()->onAcquireBuffer(mResource->mDequantKeyScale.get(), Backend::STATIC); - backend()->onAcquireBuffer(mResource->mDequantKeyZeroPoint.get(), Backend::STATIC); - } else { - mResource->mPastKey.reset(Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), mResource->mHeadDim, hP})); - backend()->onAcquireBuffer(mResource->mPastKey.get(), Backend::STATIC); - } - if (quantValue) { - mResource->mPastValue.reset(Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mHeadDim, hP), mResource->mMaxLength, hP})); - backend()->onAcquireBuffer(mResource->mPastValue.get(), Backend::STATIC); - } else { - mResource->mPastValue.reset(Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mHeadDim, hP), mResource->mMaxLength, hP})); - backend()->onAcquireBuffer(mResource->mPastValue.get(), Backend::STATIC); - } -} - -void CPUAttention::reallocKVCache(int kv_seq_len, bool quantKey, bool quantValue) { - if (!mKVCache || kv_seq_len <= mResource->mMaxLength) { - return; - } - int oldMaxLength = mResource->mMaxLength; - mResource->mMaxLength = kv_seq_len + mResource->mExpandChunk; - if (quantKey) { - auto new_key = Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), mResource->mHeadDim, hP}); - auto new_scale = Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), 1, hP}); - auto new_zeroPoint = Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), 1, hP}); - backend()->onAcquireBuffer(new_key, Backend::STATIC); - backend()->onAcquireBuffer(new_scale, Backend::STATIC); - backend()->onAcquireBuffer(new_zeroPoint, Backend::STATIC); - for (int h = 0; h < mResource->mKvNumHead; h++) { - memcpy(new_key->host() + h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP, - mResource->mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mResource->mHeadDim * hP, - UP_DIV(oldMaxLength, hP) * mResource->mHeadDim * hP); - memcpy(new_scale->host() + h * UP_DIV(mResource->mMaxLength, hP) * hP * bytes, - mResource->mDequantKeyScale->host() + h * UP_DIV(oldMaxLength, hP) * hP * bytes, - UP_DIV(oldMaxLength, hP) * hP * bytes); - memcpy(new_zeroPoint->host() + h * UP_DIV(mResource->mMaxLength, hP) * hP * bytes, - mResource->mDequantKeyZeroPoint->host() + h * UP_DIV(oldMaxLength, hP) * hP * bytes, - UP_DIV(oldMaxLength, hP) * hP * bytes); - } - mResource->mPastKey.reset(new_key); - mResource->mDequantKeyScale.reset(new_scale); - mResource->mDequantKeyZeroPoint.reset(new_zeroPoint); - } - else { - auto new_key = Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mMaxLength, hP), mResource->mHeadDim, hP}); - backend()->onAcquireBuffer(new_key, Backend::STATIC); - for (int h = 0; h < mResource->mKvNumHead; h++) { - memcpy(new_key->host() + h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * bytes, - mResource->mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mResource->mHeadDim * hP * bytes, - UP_DIV(oldMaxLength, hP) * mResource->mHeadDim * hP * bytes); - } - mResource->mPastKey.reset(new_key); - } - if (quantValue) { - auto new_value = Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mHeadDim, hP), mResource->mMaxLength, hP}); - backend()->onAcquireBuffer(new_value, Backend::STATIC); - for (int h = 0; h < mResource->mKvNumHead; h++) { - for (int i = 0; i < UP_DIV(mResource->mHeadDim, hP); i++) { - memcpy(new_value->host() + (h * UP_DIV(mResource->mHeadDim, hP) + i) * mResource->mMaxLength * hP, - mResource->mPastValue->host() + (h * UP_DIV(mResource->mHeadDim, hP) + i) * oldMaxLength * hP, - oldMaxLength * hP); - } - } - mResource->mPastValue.reset(new_value); - } - else { - auto new_value = Tensor::createDevice({mResource->mKvNumHead, UP_DIV(mResource->mHeadDim, hP), mResource->mMaxLength, hP}); - backend()->onAcquireBuffer(new_value, Backend::STATIC); - for (int h = 0; h < mResource->mKvNumHead; h++) { - for (int i = 0; i < UP_DIV(mResource->mHeadDim, hP); i++) { - memcpy(new_value->host() + (h * UP_DIV(mResource->mHeadDim, hP) + i) * mResource->mMaxLength * hP * bytes, - mResource->mPastValue->host() + (h * UP_DIV(mResource->mHeadDim, hP) + i) * oldMaxLength * hP * bytes, - oldMaxLength * hP * bytes); - } - } - mResource->mPastValue.reset(new_value); - } -} - ErrorCode CPUAttention::onResize(const std::vector& inputs, const std::vector& outputs) { auto core = static_cast(backend())->functions(); core->MNNGetMatMulPackMode(&eP, &lP, &hP); @@ -383,11 +122,12 @@ ErrorCode CPUAttention::onResize(const std::vector& inputs, const std:: auto query = inputs[0]; auto key = inputs[1]; int seq_len = query->shape()[1]; - mResource->mNumHead = query->shape()[2]; - mResource->mHeadDim = query->shape()[3]; - mResource->mKvNumHead = key->shape()[2]; - mPackQ.reset(Tensor::createDevice({mThreadNum, UP_DIV(seq_len, eP), mResource->mHeadDim, eP})); - mPackQKV.reset(Tensor::createDevice({mThreadNum, UP_DIV(mResource->mHeadDim, unit), seq_len, unit})); + mNumHead = query->shape()[2]; + mHeadDim = query->shape()[3]; + mKvNumHead = key->shape()[2]; + mKVCacheManager->onResize(mKvNumHead, mHeadDim); + mPackQ.reset(Tensor::createDevice({mThreadNum, UP_DIV(seq_len, eP), mHeadDim, eP})); + mPackQKV.reset(Tensor::createDevice({mThreadNum, UP_DIV(mHeadDim, unit), seq_len, unit})); backend()->onAcquireBuffer(mPackQ.get(), Backend::DYNAMIC); backend()->onAcquireBuffer(mPackQKV.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(mPackQ.get(), Backend::DYNAMIC); @@ -396,7 +136,7 @@ ErrorCode CPUAttention::onResize(const std::vector& inputs, const std:: } ErrorCode CPUAttention::onExecute(const std::vector& inputs, const std::vector& outputs) { - auto core = static_cast(backend())->functions(); + auto core = static_cast(backend())->functions(); auto query = inputs[0]; auto key = inputs[1]; auto value = inputs[2]; @@ -410,19 +150,10 @@ ErrorCode CPUAttention::onExecute(const std::vector& inputs, const std: mIsPrefill = (seq_len > 1); // isPrefill and mask is Square Matrix, is FirstPrefill mIsFirstPrefill = mIsPrefill && (mask_kvlen == mask_seqlen); - int tileCount = UP_DIV(mResource->mNumHead, mThreadNum); - int group_size = mResource->mNumHead / mResource->mKvNumHead; - - // 0: do not quant kv - // 1: only quant k - // 2: only quant v - // 3: quant kv - int quantKV = static_cast(backend())->getRuntime()->hint().kvcacheQuantOption; - bool quantKey = (quantKV & 1) == 1; - bool quantValue = ((quantKV >> 1) & 1) == 1; - + int tileCount = UP_DIV(mNumHead, mThreadNum); + int group_size = mNumHead / mKvNumHead; // reduce the value of 'query' to avoid fp16 overflow - float mScale = 1.0 / sqrt(mResource->mHeadDim); + float mScale = 1.0 / sqrt(mHeadDim); float q_scale = 1.0; if (bytes == 2) { q_scale = FP16_QSCALE; @@ -430,133 +161,70 @@ ErrorCode CPUAttention::onExecute(const std::vector& inputs, const std: } if (mIsPrefill) { - // Only reset the kvcache in the first prefill, but keep the kvcache in subsequent prefill if (mIsFirstPrefill) { - mResource->mPastLength = 0; - allocKVCache(seq_len, quantKey, quantValue); + mKVCacheManager->onClear(); + mKVCacheManager->onAlloc(seq_len); } else { - reallocKVCache(mResource->mPastLength + seq_len, quantKey, quantValue); + mKVCacheManager->onRealloc(mKVCacheManager->kvLength() + seq_len); } } else { // Decode - reallocKVCache(mResource->mPastLength + 1, quantKey, quantValue); + mKVCacheManager->onRealloc(mKVCacheManager->kvLength() + 1); } - int kv_seq_len = mResource->mPastLength + seq_len; - + // Add the new kv to the kvcache + mKVCacheManager->onPushBack(key, value); + int kv_seq_len = mKVCacheManager->kvLength(); + int max_len = mKVCacheManager->maxLength(); + bool quant_key = mKVCacheManager->config()->mQuantKey; + bool quant_value = mKVCacheManager->config()->mQuantValue; // Temporary tensors for intermediate results std::shared_ptr packQK(Tensor::createDevice({mThreadNum, UP_DIV(kv_seq_len, unit), seq_len, unit})); std::shared_ptr unpackQK(Tensor::createDevice({mThreadNum, seq_len, kv_seq_len})); std::shared_ptr softmaxQK(Tensor::createDevice({mThreadNum, seq_len, kv_seq_len})); std::shared_ptr newPackQK(Tensor::createDevice({mThreadNum, UP_DIV(seq_len, eP), kv_seq_len, eP})); - std::shared_ptr dequantV(Tensor::createDevice({mThreadNum, UP_DIV(mResource->mHeadDim, hP), kv_seq_len, hP})); + std::shared_ptr dequantV(Tensor::createDevice({mKvNumHead, UP_DIV(mHeadDim, hP), kv_seq_len, hP})); backend()->onAcquireBuffer(packQK.get(), Backend::STATIC); backend()->onAcquireBuffer(unpackQK.get(), Backend::STATIC); backend()->onAcquireBuffer(softmaxQK.get(), Backend::STATIC); backend()->onAcquireBuffer(newPackQK.get(), Backend::STATIC); - if (quantValue) { + if (quant_value) { backend()->onAcquireBuffer(dequantV.get(), Backend::STATIC); + mKVCacheManager->onDequantValue(dequantV.get()); } std::function mCompute = [=](int tId) { - auto pack_q = mPackQ->host() + tId * UP_DIV(seq_len, eP) * mResource->mHeadDim * eP * bytes; + auto pack_q = mPackQ->host() + tId * UP_DIV(seq_len, eP) * mHeadDim * eP * bytes; auto pack_qk = packQK->host() + tId * UP_DIV(kv_seq_len, unit) * seq_len * unit * bytes; auto unpack_qk = unpackQK->host() + tId * seq_len * kv_seq_len; - auto softmax_qk = softmaxQK->host() + tId * seq_len * kv_seq_len; + auto softmax_qk = softmaxQK->host() + tId * seq_len * kv_seq_len; auto new_pack_qk = newPackQK->host() + tId * UP_DIV(seq_len, eP) * kv_seq_len * eP * bytes; - auto pack_qkv = mPackQKV->host() + tId * UP_DIV(mResource->mHeadDim, unit) * seq_len * unit * bytes; - int head_index = tId * tileCount; - for (int h = head_index; h < head_index + tileCount && h < mResource->mNumHead; h++) { - int kv_h = h / group_size; - char * key_dst = nullptr; - char * key_scale_dst = nullptr; - char * key_zero_point_dst = nullptr; - char * value_dst = nullptr; - if (quantKey) { - key_dst = mResource->mPastKey->host() + kv_h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP; - key_scale_dst = mResource->mDequantKeyScale->host() + kv_h * UP_DIV(mResource->mMaxLength, hP) * 1 * hP * bytes; - key_zero_point_dst = mResource->mDequantKeyZeroPoint->host() + kv_h * UP_DIV(mResource->mMaxLength, hP) * 1 * hP * bytes; - } else { - key_dst = mResource->mPastKey->host() + kv_h * UP_DIV(mResource->mMaxLength, hP) * mResource->mHeadDim * hP * bytes; - } - if (quantValue) { - value_dst = mResource->mPastValue->host() + kv_h * UP_DIV(mResource->mHeadDim, hP) * mResource->mMaxLength * hP; - } else { - value_dst = mResource->mPastValue->host() + kv_h * UP_DIV(mResource->mHeadDim, hP) * mResource->mMaxLength * hP * bytes; - } - // pack for matmul + auto pack_qkv = mPackQKV->host() + tId * UP_DIV(mHeadDim, unit) * seq_len * unit * bytes; + auto QxK = quant_key ? core->MNNPackedMatMul_int8 : core->MNNPackedMatMul; + auto QxK_remain = quant_key ? core->MNNPackedMatMulRemain_int8 : core->MNNPackedMatMulRemain; + int head_index = tId * tileCount; + for (int h = head_index; h < head_index + tileCount && h < mNumHead; h++) { + int kv_h = h / group_size; + char * key_addr = mKVCacheManager->addrOfKey(kv_h); + char * scale_addr = quant_key ? mKVCacheManager->addrOfScale(kv_h) : nullptr; + char * zero_point_addr = quant_key ? mKVCacheManager->addrOfZeroPoint(kv_h) : nullptr; + char * value_addr = quant_value ? dequantV->host() + kv_h * UP_DIV(mHeadDim, hP) * kv_seq_len * hP * bytes : mKVCacheManager->addrOfValue(kv_h); if (bytes == 2) { - pack_query(query, pack_q, mResource->mNumHead, mResource->mHeadDim, eP, seq_len, h, q_scale); - pack_key(key, key_dst, mResource->mPastLength, seq_len, mResource->mKvNumHead, mResource->mHeadDim, hP, kv_h, key_scale_dst, key_zero_point_dst, quantKey); - pack_value(value, value_dst, mResource->mMaxLength, mResource->mPastLength, seq_len, mResource->mKvNumHead, mResource->mHeadDim, hP, kv_h, quantValue); + pack_query(query, pack_q, mNumHead, mHeadDim, eP, seq_len, h, q_scale); } else { - pack_query(query, pack_q, mResource->mNumHead, mResource->mHeadDim, eP, seq_len, h, q_scale); - pack_key(key, key_dst, mResource->mPastLength, seq_len, mResource->mKvNumHead, mResource->mHeadDim, hP, kv_h, key_scale_dst, key_zero_point_dst, quantKey); - pack_value(value, value_dst, mResource->mMaxLength, mResource->mPastLength, seq_len, mResource->mKvNumHead, mResource->mHeadDim, hP, kv_h, quantValue); + pack_query(query, pack_q, mNumHead, mHeadDim, eP, seq_len, h, q_scale); } // query @ key int loop_e = seq_len / eP; int remain = seq_len % eP; + size_t shapeParameters[7] = {(size_t)eP * bytes, (size_t)mHeadDim, (size_t)kv_seq_len, (size_t)seq_len * unit * bytes, 0, 0, 0}; for (int i = 0 ; i < loop_e; i++) { - size_t shapeParameters[7]; - size_t* parameters = shapeParameters; - parameters[0] = eP * bytes; - parameters[1] = mResource->mHeadDim; - parameters[2] = kv_seq_len; - parameters[3] = seq_len * unit * bytes; - parameters[4] = 0; - parameters[5] = 0; - parameters[6] = 0; - if (quantKey) { - core->MNNPackedMatMul_int8( - (float*)(pack_qk + (i * eP * unit) * bytes), - (float*)(pack_q + (i * mResource->mHeadDim * eP) * bytes), - (float*)key_dst, - parameters, nullptr, nullptr, - (float*)key_scale_dst, (float*)key_zero_point_dst - ); - } else { - core->MNNPackedMatMul( - (float*)(pack_qk + (i * eP * unit) * bytes), - (float*)(pack_q + (i * mResource->mHeadDim * eP) * bytes), - (float*)key_dst, - parameters, nullptr, nullptr, - nullptr, nullptr - ); - } - } - { - size_t shapeParameters[7]; - size_t* parameters = shapeParameters; - parameters[0] = eP * bytes; - parameters[1] = mResource->mHeadDim; - parameters[2] = kv_seq_len; - parameters[3] = seq_len * unit * bytes; - parameters[4] = 0; - parameters[5] = 0; - parameters[6] = 0; - if (quantKey) { - core->MNNPackedMatMulRemain_int8( - (float*)(pack_qk + (loop_e * eP * unit) * bytes), - (float*)(pack_q + (loop_e * mResource->mHeadDim * eP) * bytes), - (float*)key_dst, - remain, parameters, nullptr, nullptr, - (float*)key_scale_dst, (float*)key_zero_point_dst - ); - } else { - core->MNNPackedMatMulRemain( - (float*)(pack_qk + (loop_e * eP * unit) * bytes), - (float*)(pack_q + (loop_e * mResource->mHeadDim * eP) * bytes), - (float*)key_dst, - remain, parameters, nullptr, nullptr, - nullptr, nullptr - ); - } + QxK((float*)(pack_qk + (i * eP * unit) * bytes), (float*)(pack_q + (i * mHeadDim * eP) * bytes), (float*)key_addr, shapeParameters, nullptr, nullptr, (float*)scale_addr, (float*)zero_point_addr); } + QxK_remain((float*)(pack_qk + (loop_e * eP * unit) * bytes), (float*)(pack_q + (loop_e * mHeadDim * eP) * bytes), (float*)key_addr, remain, shapeParameters, nullptr, nullptr, (float*)scale_addr, (float*)zero_point_addr); + // qk: [kv_seq_len/unit, seq_len, unit] -> [seq_len, kv_seq_len] -> [seq_len/eP, kv_seq_len, eP] if(bytes == 2) { - // unpack qk: [kv_seq_len/unit, seq_len, unit] -> [seq_len, kv_seq_len] unpack_QK(unpack_qk, pack_qk, seq_len, kv_seq_len, unit); mask_QK(unpack_qk, seq_len, kv_seq_len, mScale, std::numeric_limits::lowest(), mask->host(), float_mask); softmax_QK(softmax_qk, unpack_qk, seq_len, kv_seq_len); - // pack qk for qk @ v : [seq_len, kv_seq_len] -> [seq_len/eP, kv_seq_len, eP] pack_QK(new_pack_qk, softmax_qk, seq_len, kv_seq_len, eP); } else { unpack_QK(unpack_qk, pack_qk, seq_len, kv_seq_len, unit); @@ -564,56 +232,20 @@ ErrorCode CPUAttention::onExecute(const std::vector& inputs, const std: softmax_QK(softmax_qk, unpack_qk, seq_len, kv_seq_len); pack_QK(new_pack_qk, softmax_qk, seq_len, kv_seq_len, eP); } - // Dequantize values from fp8 to float - if (quantValue) { - char * qv = value_dst; - char * dqv = dequantV->host() + tId * UP_DIV(mResource->mHeadDim, hP) * kv_seq_len * hP * bytes; - if (bytes == 2) { - dequant_value_fp16(dqv, qv, mResource->mHeadDim, kv_seq_len, hP, mResource->mMaxLength); - } else { - dequant_value_float(dqv, qv, mResource->mHeadDim, kv_seq_len, hP, mResource->mMaxLength); - } - value_dst = dqv; - } // qk @ v + shapeParameters[1] = kv_seq_len; + shapeParameters[2] = mHeadDim; + shapeParameters[5] = quant_value ? 0 : (max_len - kv_seq_len) * hP * bytes; for (int i = 0 ; i < loop_e; i++) { - size_t shapeParameters[6]; - size_t* parameters = shapeParameters; - parameters[0] = eP * bytes; - parameters[1] = kv_seq_len; - parameters[2] = mResource->mHeadDim; - parameters[3] = seq_len * unit * bytes; - parameters[4] = 0; - parameters[5] = quantValue ? 0 : (mResource->mMaxLength - kv_seq_len) * hP * bytes; - core->MNNPackedMatMul( - (float*)(pack_qkv + (i * eP * unit) * bytes), - (float*)(new_pack_qk + (i * kv_seq_len * eP) * bytes), - (float*)value_dst, parameters, - nullptr, nullptr, nullptr, nullptr - ); - } - { - size_t shapeParameters[6]; - size_t* parameters = shapeParameters; - parameters[0] = eP * bytes; - parameters[1] = kv_seq_len; - parameters[2] = mResource->mHeadDim; - parameters[3] = seq_len * unit * bytes; - parameters[4] = 0; - parameters[5] = quantValue ? 0 : (mResource->mMaxLength - kv_seq_len) * hP * bytes; - core->MNNPackedMatMulRemain( - (float*)(pack_qkv + (loop_e * eP * unit) * bytes), - (float*)(new_pack_qk + (loop_e * kv_seq_len * eP) * bytes), - (float*)value_dst, remain, parameters, - nullptr, nullptr, nullptr, nullptr - ); + core->MNNPackedMatMul((float*)(pack_qkv + (i * eP * unit) * bytes), (float*)(new_pack_qk + (i * kv_seq_len * eP) * bytes), (float*)value_addr, shapeParameters, nullptr, nullptr, nullptr, nullptr); } + core->MNNPackedMatMulRemain((float*)(pack_qkv + (loop_e * eP * unit) * bytes), (float*)(new_pack_qk + (loop_e * kv_seq_len * eP) * bytes), (float*)value_addr, remain, shapeParameters, nullptr, nullptr, nullptr, nullptr); // unpack: [head_dim/unit, seq_len, unit] -> [seq_len, num_head, head_dim] - auto dst_ptr = outputs[0]->host() + h * mResource->mHeadDim * bytes; + auto dst_ptr = outputs[0]->host() + h * mHeadDim * bytes; if (bytes == 2) { - unpack_QKV(pack_qkv, dst_ptr, mResource->mNumHead, mResource->mHeadDim, unit, seq_len); + unpack_QKV(pack_qkv, dst_ptr, mNumHead, mHeadDim, unit, seq_len); } else { - unpack_QKV(pack_qkv, dst_ptr, mResource->mNumHead, mResource->mHeadDim, unit, seq_len); + unpack_QKV(pack_qkv, dst_ptr, mNumHead, mHeadDim, unit, seq_len); } } }; @@ -623,12 +255,11 @@ ErrorCode CPUAttention::onExecute(const std::vector& inputs, const std: } MNN_CONCURRENCY_END(); - mResource->mPastLength += seq_len; backend()->onReleaseBuffer(packQK.get(), Backend::STATIC); backend()->onReleaseBuffer(unpackQK.get(), Backend::STATIC); backend()->onReleaseBuffer(softmaxQK.get(), Backend::STATIC); backend()->onReleaseBuffer(newPackQK.get(), Backend::STATIC); - if (quantValue){ + if (quant_value){ backend()->onReleaseBuffer(dequantV.get(), Backend::STATIC); } return NO_ERROR; @@ -639,14 +270,26 @@ bool CPUAttention::onClone(Backend* bn, const Op* op, Execution** dst) { return true; } auto tmp = new CPUAttention(bn, mKVCache); - tmp->mResource = mResource; + tmp->mKVCacheManager = mKVCacheManager; *dst = tmp; return true; } -CPUAttention::CPUAttention(Backend *backend, bool kv_cache) : Execution(backend) { - mKVCache = kv_cache; - mResource.reset(new Resource); +CPUAttention::CPUAttention(Backend *backend, bool kv_cache) : Execution(backend), mKVCache(kv_cache) { + if (mKVCache) { + MNN::KVCacheManager::KVCacheConfig kvconfig; + int kvcacheQuantOptions = static_cast(backend)->getRuntime()->hint().kvcacheQuantOption; + kvconfig.mQuantKey = (kvcacheQuantOptions & 1); + kvconfig.mQuantValue = ((kvcacheQuantOptions >> 1) & 1); + kvconfig.mKVCacheDir = static_cast(backend)->getRuntime()->hint().kvcacheDirPath; + kvconfig.mKVCacheSizeLimit = static_cast(backend)->getRuntime()->hint().kvcacheSizeLimit; + kvconfig.mExpandChunk = 64; + mKVCacheManager.reset(new KVCacheManager(backend, kvconfig)); + } +} + +CPUAttention::~CPUAttention() { + } class CPUAttentionCreator : public CPUBackend::Creator { @@ -662,4 +305,4 @@ REGISTER_CPU_OP_CREATOR_TRANSFORMER(CPUAttentionCreator, OpType_Attention); } // namespace MNN -#endif \ No newline at end of file +#endif // MNN_SUPPORT_TRANSFORMER_FUSE \ No newline at end of file diff --git a/source/backend/cpu/CPUAttention.hpp b/source/backend/cpu/CPUAttention.hpp index abf351249..4aba816f3 100644 --- a/source/backend/cpu/CPUAttention.hpp +++ b/source/backend/cpu/CPUAttention.hpp @@ -13,38 +13,32 @@ #include #include "core/Execution.hpp" +#include "MNN/ErrorCode.hpp" +#include "KVCacheManager.hpp" namespace MNN { - class CPUAttention : public Execution { public: CPUAttention(Backend *backend, bool kv_cache); - virtual ~CPUAttention() = default; + virtual ~CPUAttention(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; - struct Resource { - std::shared_ptr mPastKey; // numhead, [maxlen/eP, headdim, eP] - std::shared_ptr mPastValue; // numhead, [headdim/eP, maxlen, eP] - std::shared_ptr mDequantKeyScale; // numhead, [maxlen/eP, 1, eP] - std::shared_ptr mDequantKeyZeroPoint; // numhead, [maxlen/eP, 1, eP] - int mPastLength = 0, mMaxLength = 0; - const int mExpandChunk = 64; - int mNumHead = 0, mKvNumHead = 0, mHeadDim = 0; - }; private: - void allocKVCache(int kv_seq_len, bool quantK, bool quantV); - void reallocKVCache(int kv_seq_len, bool quantK, bool quantV); - bool mIsPrefill = true; + bool mIsPrefill = true; bool mIsFirstPrefill = true; - bool mKVCache; - int mThreadNum = 1; - std::shared_ptr mResource; + bool mKVCache = true; + int bytes = 4; + int mThreadNum = 1;; + int eP, lP, hP, unit; + int mNumHead, mKvNumHead, mHeadDim; std::shared_ptr mPackQ, mPackQKV; - int eP, lP, hP, bytes, unit; + std::shared_ptr mKVCacheManager = nullptr; }; + } // namespace MNN #endif // CPUATTENTION_HPP -#endif + +#endif // MNN_SUPPORT_TRANSFORMER_FUSE diff --git a/source/backend/cpu/CPUBackend.cpp b/source/backend/cpu/CPUBackend.cpp index 5f1a75eab..99156a447 100644 --- a/source/backend/cpu/CPUBackend.cpp +++ b/source/backend/cpu/CPUBackend.cpp @@ -192,12 +192,14 @@ void CPURuntime::_resetThreadPool() { // Reset tid to rebind cpu if necessary mCurrentTID = 0; } -void CPURuntime::onReset(int numberThread, const BackendConfig* config) { +void CPURuntime::onReset(int numberThread, const BackendConfig* config, bool full) { if (config != nullptr) { - mPrecision = config->precision; mPower = config->power; - mMemory = config->memory; - mFlags = config->flags; + if (full) { + mPrecision = config->precision; + mMemory = config->memory; + mFlags = config->flags; + } } mThreadNumber = numberThread; _resetThreadPool(); diff --git a/source/backend/cpu/CPUBackend.hpp b/source/backend/cpu/CPUBackend.hpp index 1ac8721de..1286df907 100644 --- a/source/backend/cpu/CPUBackend.hpp +++ b/source/backend/cpu/CPUBackend.hpp @@ -25,7 +25,7 @@ class CPURuntime : public Runtime { virtual ~ CPURuntime(); int onGetRuntimeStatus(RuntimeStatus statusEnum) const override; virtual Backend* onCreate(const BackendConfig* config) const override; - virtual void onReset(int numberThread, const BackendConfig* config) override; + virtual void onReset(int numberThread, const BackendConfig* config, bool full) override; virtual void onGabageCollect(int level) override; virtual float onGetMemoryInMB() override; virtual CompilerType onGetCompilerType() const override { diff --git a/source/backend/cpu/CPUConvolution.cpp b/source/backend/cpu/CPUConvolution.cpp index 9c42008d9..109b4cc6a 100644 --- a/source/backend/cpu/CPUConvolution.cpp +++ b/source/backend/cpu/CPUConvolution.cpp @@ -85,7 +85,7 @@ CPUConvolution::MutableResourceInt8::MutableResourceInt8(std::shared_ptronAcquireBuffer(mScaleFloat.get(), Backend::STATIC); } - + } void CPUConvolution::MutableResourceInt8::updateInputOutputScale(std::vector inputQuantInfo, std::vector outputQuantInfo) { @@ -116,7 +116,7 @@ void CPUConvolution::MutableResourceInt8::updateInputOutputScale(std::vectormOutputCount; const int kernelNum = static_cast(mResource->mInt8WeightKernelSum.size()); auto biasData = mResource->mOriginBias->host(); @@ -143,22 +143,27 @@ void CPUConvolution::MutableResourceInt8::updateInputOutputScale(std::vector CPUConvolution::makeResourceInt8(Backend* backend, const MNN::Convolution2D *convParam, int pack) { +std::shared_ptr CPUConvolution::makeResourceInt8(Backend* backend, const MNN::Op* op, int pack) { + auto convParam = op->main_as_Convolution2D(); auto core = static_cast(backend)->functions(); // TODO: use different pack from float int UNIT = pack; - + std::shared_ptr resource(new ResourceInt8); // TODO: ConvInt8Winograd need in/out scale, which isn't exist in quantinfo when model construct by V3 API const auto convCommon = convParam->common(); const auto group = convParam->common()->group(); const auto outputCount = convCommon->outputCount(); const auto outputChannleUp4 = UP_DIV(outputCount, UNIT) * UNIT; - + int quanCount = outputChannleUp4; - if (convParam->quanParameter() && convParam->quanParameter()->alpha()) { + if (convParam->quanParameter() && convParam->quanParameter()->alpha() && convParam->quanParameter()->buffer()) { // For block quant models. quanCount = convParam->quanParameter()->alpha()->size(); - quanCount = ROUND_UP(quanCount, UNIT); + quanCount = ROUND_UP(quanCount, UNIT); // If block quant applied, quanCount > outputChannelUp4 + if (quanCount < outputChannleUp4) { + MNN_PRINT("quantCount < outputUp4, check if need.\n"); + quanCount = outputChannleUp4; + } } resource->mOriginBias.reset(Tensor::createDevice({quanCount})); resource->mOriginScale.reset(Tensor::createDevice({quanCount * core->bytes})); @@ -185,7 +190,7 @@ std::shared_ptr CPUConvolution::makeResourceInt8(B int weightSize = 0; std::shared_ptr quanCommon; resource->mOutputCount = outputCount; - if (!ConvolutionCommon::getConvInt8Parameters(convParam, quanCommon, backend, weightSrc, weightSize, scalePtr, biasPtr, betaPtr)) { + if (!ConvolutionCommon::getConvInt8Parameters(op, quanCommon, backend, weightSrc, weightSize, scalePtr, biasPtr, betaPtr)) { return nullptr; } if (convParam->bias() && convParam->quanParameter()->alpha()) { @@ -195,6 +200,7 @@ std::shared_ptr CPUConvolution::makeResourceInt8(B resource->mWeightAsymmetricQuant = quanCommon->asymmetric; } + // TODO: first alloc. resource->mWeightInt8.reset(Tensor::createDevice({weightSize})); allocRes = backend->onAcquireBuffer(resource->mWeightInt8.get(), Backend::STATIC); if (!allocRes) { @@ -238,6 +244,7 @@ std::shared_ptr CPUConvolution::makeResourceInt8(B resource->mOutputScale = convParam->quanParameter()->scaleOut(); } auto weightDst = resource->mWeightInt8->host(); + // TODO(yanxing): don't copy! memcpy(weightDst, weightSrc, resource->mWeightInt8->size()); resource->mRelu = convCommon->relu() || convCommon->relu6(); if (convParam->symmetricQuan() && convParam->symmetricQuan()->outputDataType() == MNN::DataType_DT_FLOAT) { @@ -247,8 +254,9 @@ std::shared_ptr CPUConvolution::makeResourceInt8(B return resource; } -void CPUConvolution::makeResource(Backend* backend, std::shared_ptr resource, const Convolution2D* conv2d, std::shared_ptr resourceInt8) { +void CPUConvolution::makeResource(Backend* backend, std::shared_ptr resource, const MNN::Op *op, std::shared_ptr resourceInt8) { /* Used to compute weight quant scale and bias and weightKernelSum of type float. */ + auto conv2d = op->main_as_Convolution2D(); bool quanBuffer = (conv2d->quanParameter() != nullptr && conv2d->quanParameter()->buffer() != nullptr); MNN_ASSERT(quanBuffer || resourceInt8); resource->backend = backend; @@ -269,11 +277,11 @@ void CPUConvolution::makeResource(Backend* backend, std::shared_ptr re auto alphaPtr = resource->mDequantize.mScaleBias->host(); auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + ocUp4 * core->bytes); ::memset(alphaPtr, 0, 2 * ocUp4 * core->bytes); - + std::shared_ptr quantCommon; // Load quant scale and bias if (quanBuffer) { - quantCommon = ConvolutionCommon::load(conv2d, backend, false, true); + quantCommon = ConvolutionCommon::load(op, backend, false, true); weightOrigin = quantCommon->weight.get(); // weight before reorder int h = quantCommon->alpha.size(); @@ -321,7 +329,7 @@ void CPUConvolution::makeResource(Backend* backend, std::shared_ptr re } } } - + // Compute float weightKernelSum resource->mWeightKernelSum.reset(Tensor::createDevice({ocUp4 * 4})); success = resource->backend->onAcquireBuffer(resource->mWeightKernelSum.get(), Backend::STATIC); @@ -347,6 +355,73 @@ void CPUConvolution::makeResource(Backend* backend, std::shared_ptr re } } +void CPUConvolution::makeResourceNew(Backend* backend, const Convolution2D* conv2d, std::shared_ptr resourceInt8) { + /* Used to compute weight quant scale and bias and weightKernelSum of type float. */ + bool quanBuffer = (conv2d->quanParameter() != nullptr && conv2d->quanParameter()->buffer() != nullptr); + MNN_ASSERT(quanBuffer || resourceInt8); + auto core = static_cast(backend)->functions(); + // common parameters + int outputCount = conv2d->common()->outputCount(); + int LSize = conv2d->common()->inputCount() * conv2d->common()->kernelX() * conv2d->common()->kernelY(); + int ocUp4 = ROUND_UP(outputCount, core->pack); + int8_t* weightOrigin; + + // Save weight quant scale and bias: wf=scale*wi+bias + std::shared_ptr scaleBias(Tensor::createDevice({2 * ocUp4 * core->bytes})); + auto success = backend->onAcquireBuffer(scaleBias.get(), Backend::STATIC); + if (!success) { + MNN_ERROR("Alloc dequant scaleBias memory error\n"); + return; + } + auto alphaPtr = scaleBias->host(); + auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + ocUp4 * core->bytes); + ::memset(alphaPtr, 0, 2 * ocUp4 * core->bytes); + + // Load quant scale and bias + weightOrigin = resourceInt8->mWeightInt8->host(); + auto wZero = resourceInt8->mWeightQuantZero->host(); // has packed to outputUp4 + auto wScale = resourceInt8->mOriginScale->host(); + int h = ocUp4; + if (core->bytes == 2) { + std::unique_ptr tmp(new int16_t[h]); + core->MNNFp32ToLowp(wScale, tmp.get(), h); + for (int i=0; i< h; ++i) { + reinterpret_cast(alphaPtr)[i] = tmp[i]; + reinterpret_cast(biasPtr)[i] = (-1.f) * wZero[i] * tmp[i]; + } + } else { + for (int i=0; i< h; ++i) { + alphaPtr[i] = wScale[i]; + biasPtr[i] = (-1.f) * wZero[i] * wScale[i]; + } + } + resourceInt8->mOriginScale = scaleBias; + + // Compute float weightKernelSum + resourceInt8->mWeightKernelSum.reset(Tensor::createDevice({ocUp4 * 4})); + success = backend->onAcquireBuffer(resourceInt8->mWeightKernelSum.get(), Backend::STATIC); + if (!success) { + MNN_ERROR("Alloc dequant mWeightKernelSum memory error\n"); + return; + } + auto weightKernelSum = resourceInt8->mWeightKernelSum->host(); + for (int i = 0; i < outputCount; ++i) { + int sum = 0; + for (int j = 0; j < LSize; ++j) { + sum = sum + static_cast(weightOrigin[j + i * LSize]); + } + if(core->bytes == 2) { + auto scale = reinterpret_cast(alphaPtr)[i]; + auto bias = reinterpret_cast(biasPtr)[i]; + weightKernelSum[i] = static_cast(sum) * scale + LSize * bias; + } else { + auto scale = alphaPtr[i]; + auto bias = biasPtr[i]; + weightKernelSum[i] = static_cast(sum) * scale + LSize * bias; + } + } +} + CPUConvolution::CPUConvolution(const Convolution2DCommon *convOp, Backend *b) : MNN::Execution(b), mCommon(convOp) { // Do nothing } @@ -399,16 +474,16 @@ class CPUConvInt8Creator : public CPUBackend::Creator { return OneDNNConvInt8::create(backend, convOp, inputs, outputs); #endif auto core = static_cast(backend)->functions(); - auto res = CPUConvolution::makeResourceInt8(backend, convOp, core->pack); + auto res = CPUConvolution::makeResourceInt8(backend, op, core->pack); #ifdef MNN_USE_SPARSE_COMPUTE if (static_cast(backend)->functions()->pack == 4 && convOp->sparseParameter() && SparseConvInt8TiledExecutor::shouldUseSparse(convOp)) { - return new SparseConvInt8TiledExecutor(backend, convOp, res); + return new SparseConvInt8TiledExecutor(backend, op, res); } #endif if (ConvInt8Winograd::mustUse(convOp)) { return new ConvInt8Winograd(backend, convOp, res); } - return new DenseConvInt8TiledExecutor(backend, convOp, res, false); + return new DenseConvInt8TiledExecutor(backend, op, res); } }; diff --git a/source/backend/cpu/CPUConvolution.hpp b/source/backend/cpu/CPUConvolution.hpp index d241007d6..a34f68aad 100644 --- a/source/backend/cpu/CPUConvolution.hpp +++ b/source/backend/cpu/CPUConvolution.hpp @@ -58,14 +58,16 @@ class CPUConvolution : public Execution { std::vector mReluThreshold; }; struct ResourceInt8 { - std::vector mInt8WeightKernelSum; - std::shared_ptr mWeightInt8; - std::shared_ptr mOriginBias; - std::shared_ptr mOriginScale; - std::shared_ptr mWeightQuantZero; + std::vector mInt8WeightKernelSum; // PTQ's sum, DynamicQ not use + std::shared_ptr mWeightInt8; // PTQ's and DynamicQ's weight + std::shared_ptr mOriginBias; // PTQ's and DynamicQ's bias + std::shared_ptr mOriginScale; // PTQ's scale + bias, DynamicQ's alpha + zero; + std::shared_ptr mWeightQuantZero; // PTQ's zero + std::shared_ptr mWeightKernelSum; // PTQ's and DynamicQ's weight kernel sum; + std::vector mReluThreshold; // relu or relu6 bool mRelu; - int mActBits; + int mActBits; // quant bits int mOutputCount; bool mUseConvQuan = true; @@ -97,8 +99,9 @@ class CPUConvolution : public Execution { int32_t mShiftBits = 14; bool mValid; }; - static std::shared_ptr makeResourceInt8(Backend *backend, const MNN::Convolution2D *convOp, int pack=4); - static void makeResource(Backend* backend, std::shared_ptr resource, const Convolution2D* conv2d, std::shared_ptr resourceInt8 = nullptr); + static std::shared_ptr makeResourceInt8(Backend *backend, const MNN::Op *op, int pack=4); + static void makeResource(Backend* backend, std::shared_ptr resource, const MNN::Op *op, std::shared_ptr resourceInt8 = nullptr); + static void makeResourceNew(Backend* backend, const Convolution2D* conv2d, std::shared_ptr resourceInt8); CPUConvolution(const Convolution2DCommon *convOp, Backend *b); virtual ~CPUConvolution() = default; virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; diff --git a/source/backend/cpu/CPUConvolutionDepthwise.cpp b/source/backend/cpu/CPUConvolutionDepthwise.cpp index 03767edfa..f3fdf2cb3 100644 --- a/source/backend/cpu/CPUConvolutionDepthwise.cpp +++ b/source/backend/cpu/CPUConvolutionDepthwise.cpp @@ -265,7 +265,7 @@ class CPUConvolutionDepthwiseCreator : public CPUBackend::Creator { int originBiasSize = 0; std::shared_ptr quanCommon; if (nullptr != conv2d->quanParameter()) { - quanCommon = ConvolutionCommon::load(conv2d, backend, true); + quanCommon = ConvolutionCommon::load(op, backend, true); // Back to float originWeight = quanCommon->weightFloat.get(); originWeightSize = quanCommon->weightFloat.size(); diff --git a/source/backend/cpu/CPUDeconvolution.cpp b/source/backend/cpu/CPUDeconvolution.cpp index 0a1e6f813..0364ad58e 100644 --- a/source/backend/cpu/CPUDeconvolution.cpp +++ b/source/backend/cpu/CPUDeconvolution.cpp @@ -173,13 +173,13 @@ CPUDeconvolution::CPUDeconvolution(const Tensor* input, const Op* convOp, Backen auto biasPtr = _bias.data(); auto scalePtr = _scale.data(); auto betaPtr = _beta.data(); - + if (ModeInt8) { - ConvolutionCommon::getConvInt8Parameters(conv2d, quanCommon, backend, quanWeightInt8, tempWeightSize, scalePtr, biasPtr, betaPtr); + ConvolutionCommon::getConvInt8Parameters(convOp, quanCommon, backend, quanWeightInt8, tempWeightSize, scalePtr, biasPtr, betaPtr); } else { - ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2d, &tempWeight, &tempWeightSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend, convOp, &tempWeight, &tempWeightSize); } - + bool success = backend->onAcquireBuffer(mWeight.get(), Backend::STATIC) && backend->onAcquireBuffer(cache.get(), Backend::STATIC); if (!success) { @@ -299,7 +299,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c //int zeroPoint = 0; auto biasTensor = inputs[2]; - + // prepare for float2int8 if necessary. auto outputQuant = TensorUtils::getQuantInfo(outputs[0]); float scale = outputQuant[0]; @@ -333,7 +333,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c } mMatMul.reset(new StrassenMatrixComputor(backend(), true, maxDepth)); // tempInput->buffer().host = (uint8_t*)inputPtr; - + needReleaseTempInput = false; TensorUtils::getDescribeOrigin(tempInput.get())->mem = new CPUMemObj(nullptr, TensorUtils::getDescribeOrigin(input)->mem->chunk(), 0); mMatMul->onEncode({tempInput.get(), inputs[1]}, {mTempOutput.get()}); diff --git a/source/backend/cpu/CPUDeconvolution.hpp b/source/backend/cpu/CPUDeconvolution.hpp index c9e0427f0..ed932e0b4 100644 --- a/source/backend/cpu/CPUDeconvolution.hpp +++ b/source/backend/cpu/CPUDeconvolution.hpp @@ -45,17 +45,18 @@ class CPUDeconvolutionOrigin : public CPUDeconvolutionBasic { auto conv2d = convOp->main_as_Convolution2D(); auto common = conv2d->common(); auto pack = static_cast(b)->functions()->pack; - mResource = CPUConvolution::makeResourceInt8(backend(), conv2d, pack); + mResource = CPUConvolution::makeResourceInt8(backend(), convOp, pack); CPUConvolution::MutableResourceInt8 mutableResource(mResource, b); auto core = static_cast(b)->int8Functions(); auto gemmKernel = core->Int8GemmKernel; int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); const auto kEleCnt = mCommon->kernelX() * mCommon->kernelY(); - const int ocDiv4 = UP_DIV(common->outputCount(), UNIT) * kEleCnt; + const int ocDiv4 = UP_DIV(common->outputCount(), pack) * kEleCnt; const int icDiv4 = UP_DIV(common->inputCount(), SRC_UNIT); + const int ocDivUnit = UP_DIV(common->outputCount(), UNIT); const int oc4 = ocDiv4 / kEleCnt; - const int bias_elesize = ocDiv4 * UNIT; + const int bias_elesize = ocDiv4 * pack; // set offset if use SSE. auto inputQuant = TensorUtils::getQuantInfo(input); auto inputZeroPoint = inputQuant[1]; @@ -66,7 +67,7 @@ class CPUDeconvolutionOrigin : public CPUDeconvolutionBasic { gemmKernel = core->Int8GemmKernelFast; } for (int a = 0; a < kEleCnt; ++a){ - for (int oz = 0; oz < oc4 * UNIT; ++oz) { + for (int oz = 0; oz < ocDivUnit * UNIT; ++oz) { int offset = inputZeroPoint, oz4 = oz / UNIT, ozRemain = oz % UNIT; for (int sz = 0; sz < icDiv4 * SRC_UNIT; ++sz) { int sz4 = sz / SRC_UNIT, szRemain = sz % SRC_UNIT; @@ -74,7 +75,9 @@ class CPUDeconvolutionOrigin : public CPUDeconvolutionBasic { auto weightInt8Data = weightDataPtr[index]; offset += weightInt8Data * (-128); } - _bias[a * oc4 * UNIT + oz] = offset; + if (oz < oc4 * pack) { + _bias[a * oc4 * pack + oz] = offset; + } } } #else @@ -82,7 +85,7 @@ class CPUDeconvolutionOrigin : public CPUDeconvolutionBasic { gemmKernel = core->Int8GemmKernelFast; } #endif - mDeconvInt8Exe.reset(new GemmInt8Executor(b, mResource, conv2d, gemmKernel, _bias)); + mDeconvInt8Exe.reset(new GemmInt8Executor(b, mResource, convOp, gemmKernel, _bias)); } } virtual ~CPUDeconvolutionOrigin() = default; diff --git a/source/backend/cpu/CPUDeconvolutionDepthwise.cpp b/source/backend/cpu/CPUDeconvolutionDepthwise.cpp index 768abbad0..4e1b7f04e 100644 --- a/source/backend/cpu/CPUDeconvolutionDepthwise.cpp +++ b/source/backend/cpu/CPUDeconvolutionDepthwise.cpp @@ -27,7 +27,7 @@ CPUDeconvolutionDepthwise::CPUDeconvolutionDepthwise(const Tensor* input, const const float* tempWeight = nullptr; int tempWeightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, b, conv, &tempWeight, &tempWeightSize); + ConvolutionCommon::getConvParameters(&quanCommon, b, convOp, &tempWeight, &tempWeightSize); // Reorder weight from whc -> pwhc4 int kernelSize = depthQuad * core->pack * kw * kh; diff --git a/source/backend/cpu/CPUDepthwiseConvInt8.cpp b/source/backend/cpu/CPUDepthwiseConvInt8.cpp index 347dd4839..0df722bb4 100644 --- a/source/backend/cpu/CPUDepthwiseConvInt8.cpp +++ b/source/backend/cpu/CPUDepthwiseConvInt8.cpp @@ -252,7 +252,7 @@ class CPUDepthwiseConvInt8Creator : public CPUBackend::Creator { UNIT = 4; } } - auto res = CPUConvolution::makeResourceInt8(backend, convOp, UNIT); + auto res = CPUConvolution::makeResourceInt8(backend, op, UNIT); const int kernelSize = common->kernelX() * common->kernelY(); const int outputCount = common->outputCount(); const int ocDivUnit = UP_DIV(outputCount, UNIT); diff --git a/source/backend/cpu/KVCacheManager.cpp b/source/backend/cpu/KVCacheManager.cpp new file mode 100644 index 000000000..7804d3dd5 --- /dev/null +++ b/source/backend/cpu/KVCacheManager.cpp @@ -0,0 +1,467 @@ +// +// KVCacheManager.cpp +// MNN +// +// Created by MNN on 2024/08/05. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef MNN_SUPPORT_TRANSFORMER_FUSE + +#include "KVCacheManager.hpp" +#include "core/Concurrency.h" + +namespace MNN { + +// @brief Translate an address to a hex number string +static inline std::string addrToHex(void *addr) { + std::string result = ""; + uint64_t n = (uint64_t)addr; + for(int i = 15; i >= 0; i--) { + int t = (n >> (i * 4)) & 0x0f; + result.push_back((t < 10) ? ('0' + t) : ('A' + t - 10)); + } + return result; +} + +void KVCacheManager::createKVCacheFile() { + // Each layer has its own kvcache, so we have to create a key file and a value file for each layer and the file name must be unique + // Here we use the address of the mResource as the file name because the addresses of mResource in different layers are guaranteed to be different + std::string fileName = addrToHex(this); + std::string pathk = MNNFilePathConcat(mConfig.mKVCacheDir, fileName) + ".k"; + std::string pathv = MNNFilePathConcat(mConfig.mKVCacheDir, fileName) + ".v"; + mKeyCacheFD = MNNCreateFile(pathk.c_str()); + mValueCacheFD = MNNCreateFile(pathv.c_str()); + if (mKeyCacheFD == INVALID_FILE) { + MNN_PRINT("Failed to create the file: %s\n", pathk.c_str()); + } + if (mValueCacheFD == INVALID_FILE) { + MNN_PRINT("Failed to create the file: %s\n", pathv.c_str()); + } +} + +void KVCacheManager::removeKVCacheFile() { + std::string fileName = addrToHex(this); + std::string pathk = MNNFilePathConcat(mConfig.mKVCacheDir, fileName) + ".k"; + std::string pathv = MNNFilePathConcat(mConfig.mKVCacheDir, fileName) + ".v"; + if (mKeyCacheFD != INVALID_FILE) { + MNNCloseFile(mKeyCacheFD); + mKeyCacheFD = INVALID_FILE; + if (MNNRemoveFile(pathk.c_str()) != MNN::NO_ERROR) { + MNN_PRINT("Failed to remove the file: %s\n", pathk.c_str()); + } + } + if (mValueCacheFD != INVALID_FILE) { + MNNCloseFile(mValueCacheFD); + mValueCacheFD = INVALID_FILE; + if (MNNRemoveFile(pathv.c_str()) != MNN::NO_ERROR) { + MNN_PRINT("Failed to remove the file: %s\n", pathv.c_str()); + } + } +} + +void KVCacheManager::resetKVCacheFileSize(size_t keySize, size_t valueSize) { + if (MNNSetFileSize(mKeyCacheFD, keySize) != MNN::NO_ERROR || MNNSetFileSize(mValueCacheFD, valueSize) != MNN::NO_ERROR) { + MNN_PRINT("Failed to resize the kvcache files!\n"); + } +} + +/* +** @brief Memory-map the kvcache file +** @hint After memory-mapping, we can access the kvcache files with pointers, just like accessing memory buffer +** But the data actually resides in disk. +** The OS will set some kernel page cache and manage the data swaping, which we do not need to care. +*/ +void KVCacheManager::mmapKVCache(size_t keySize, size_t valueSize) +{ + if (mMapKeyAddr == nullptr) { + mMapKeyAddr = (char *)MNNMmapFile(mKeyCacheFD, keySize); + if (mMapKeyAddr == nullptr) { + MNN_PRINT("Failed to memory-map the kvcache!\n"); + } + } + if (mMapValueAddr == nullptr) { + mMapValueAddr = (char *)MNNMmapFile(mValueCacheFD, valueSize); + if (mMapValueAddr == nullptr) { + MNN_PRINT("Failed to memory-map the kvcache!\n"); + } + } +} + +void KVCacheManager::unmapKVCache(size_t keySize, size_t valueSize) +{ + if (mMapKeyAddr != nullptr) { + MNNUnmapFile(mMapKeyAddr, keySize); + mMapKeyAddr = nullptr; + } + if (mMapValueAddr != nullptr) { + MNNUnmapFile(mMapValueAddr, valueSize); + mMapValueAddr = nullptr; + } +} + +/* +** @brief Expand the size of kvcache and copy it from the old tensor in memory to the new tensor in memory +** Finally reset the pointer to the new tensor +*/ +void KVCacheManager::expandKVCacheInMem(int oldMaxLength) { + /*=================================== Key ===================================*/ + if (mConfig.mQuantKey) { + auto new_key = Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP}); + mBackend->onAcquireBuffer(new_key, Backend::STATIC); + for (int h = 0; h < mKvNumHead; h++) { + memcpy(new_key->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP, mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, UP_DIV(oldMaxLength, hP) * mHeadDim * hP); + } + mPastKey.reset(new_key); + } + else { + auto new_key = Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP}); + mBackend->onAcquireBuffer(new_key, Backend::STATIC); + for (int h = 0; h < mKvNumHead; h++) { + memcpy(new_key->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes); + } + mPastKey.reset(new_key); + } + /*=================================== Value ===================================*/ + if (mConfig.mQuantValue) { + auto new_value = Tensor::createDevice({mKvNumHead, UP_DIV(mHeadDim, hP), mMaxLength, hP}); + mBackend->onAcquireBuffer(new_value, Backend::STATIC); + for (int h = 0; h < mKvNumHead; h++) { + for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { + memcpy(new_value->host() + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, mPastValue->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, oldMaxLength * hP); + } + } + mPastValue.reset(new_value); + } + else { + auto new_value = Tensor::createDevice({mKvNumHead, UP_DIV(mHeadDim, hP), mMaxLength, hP}); + mBackend->onAcquireBuffer(new_value, Backend::STATIC); + for (int h = 0; h < mKvNumHead; h++) { + for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { + memcpy(new_value->host() + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, mPastValue->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, oldMaxLength * hP * mBytes); + } + } + mPastValue.reset(new_value); + } +} + +/* +** @brief Move the kvcache from memory to the memory-mapped kvcache files in disk +** Then release the memory buffer of old kvcache +*/ +void KVCacheManager::moveKVCacheFromMemToDisk(int oldMaxLength) { + /*=================================== Key ===================================*/ + if (mConfig.mQuantKey) { + for (int h = 0; h < mKvNumHead; h++) { + memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP, mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, UP_DIV(oldMaxLength, hP) * mHeadDim * hP); + } + mBackend->onReleaseBuffer(mPastKey.get(), Backend::STATIC); + mPastKey.reset(); + } + else { + for (int h = 0; h < mKvNumHead; h++) { + memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes); + } + mBackend->onReleaseBuffer(mPastKey.get(), Backend::STATIC); + mPastKey.reset(); + } + /*=================================== Value ===================================*/ + if (mConfig.mQuantValue) { + for (int h = 0; h < mKvNumHead; h++) { + for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { + memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, mPastValue->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, oldMaxLength * hP); + } + } + mBackend->onReleaseBuffer(mPastValue.get(), Backend::STATIC); + mPastValue.reset(); + } + else { + for (int h = 0; h < mKvNumHead; h++) { + for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { + memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, mPastValue->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, oldMaxLength * hP * mBytes); + } + } + mBackend->onReleaseBuffer(mPastValue.get(), Backend::STATIC); + mPastValue.reset(); + } +} + +/* +** @brief Expand the size of kvcache files in disk +*/ +void KVCacheManager::expandKVCacheInDisk(int oldMaxLength) { + size_t oldKeySize = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); + size_t oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * oldMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); + size_t keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); + size_t valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); + // Step 1: Copy the old kvcache from files to temporary buffers in memory + std::shared_ptr old_key, old_value; + if (mConfig.mQuantKey) { + old_key.reset(Tensor::createDevice({mKvNumHead, UP_DIV(oldMaxLength, hP), mHeadDim, hP})); + } else { + old_key.reset(Tensor::createDevice({mKvNumHead, UP_DIV(oldMaxLength, hP), mHeadDim, hP})); + } + if (mConfig.mQuantValue) { + old_value.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mHeadDim, hP), oldMaxLength, hP})); + } else { + old_value.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mHeadDim, hP), oldMaxLength, hP})); + } + mBackend->onAcquireBuffer(old_key.get(), Backend::STATIC); + mBackend->onAcquireBuffer(old_value.get(), Backend::STATIC); + mmapKVCache(oldKeySize, oldValueSize); + memcpy(old_key->host(), mMapKeyAddr, oldKeySize); + memcpy(old_value->host(), mMapValueAddr, oldValueSize); + // Step 2: Resize the kvcache files and remap them + unmapKVCache(oldKeySize, oldValueSize); + resetKVCacheFileSize(keySize, valueSize); + mmapKVCache(keySize, valueSize); + // Step 3: Move the kvcache from temporary buffers in memory to disk + if (mConfig.mQuantKey) { + for (int h = 0; h < mKvNumHead; h++) { + memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP, old_key->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, UP_DIV(oldMaxLength, hP) * mHeadDim * hP); + } + } else { + for (int h = 0; h < mKvNumHead; h++) { + memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, old_key->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes); + } + } + if (mConfig.mQuantValue) { + for (int h = 0; h < mKvNumHead; h++) { + for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { + memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, old_value->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, oldMaxLength * hP); + } + } + } else { + for (int h = 0; h < mKvNumHead; h++) { + for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { + memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, old_value->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, oldMaxLength * hP * mBytes); + } + } + } + // Step 4: Release the temporary buffers + mBackend->onReleaseBuffer(old_key.get(), Backend::STATIC); + mBackend->onReleaseBuffer(old_value.get(), Backend::STATIC); +} + +void KVCacheManager::onResize(int kv_num_head, int head_dim) { + mKvNumHead = kv_num_head; + mHeadDim = head_dim; + auto core = static_cast(mBackend)->functions(); + core->MNNGetMatMulPackMode(&eP, &lP, &hP); + mBytes = core->bytes; + mThreadNum = static_cast(mBackend)->threadNumber(); + if (mThreadNum > mKvNumHead) { + mThreadNum = mKvNumHead; + } +} + +void KVCacheManager::onAlloc(int kv_seq_len) { + mMaxLength = kv_seq_len + mConfig.mExpandChunk; + size_t keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); + size_t valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); + /*============== Put the kvcache in disk ===========*/ + if (mConfig.mKVCacheSizeLimit != -1 && keySize + valueSize > mConfig.mKVCacheSizeLimit) { + createKVCacheFile(); + resetKVCacheFileSize(keySize, valueSize); + mmapKVCache(keySize, valueSize); + mKVCacheInDisk = true; + } + /*============== Put the kvcache in memory ===========*/ + else { + if (mConfig.mQuantKey) { + mPastKey.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP})); + } else { + mPastKey.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP})); + } + if (mConfig.mQuantValue) { + mPastValue.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mHeadDim, hP), mMaxLength, hP})); + } else { + mPastValue.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mHeadDim, hP), mMaxLength, hP})); + } + mBackend->onAcquireBuffer(mPastKey.get(), Backend::STATIC); + mBackend->onAcquireBuffer(mPastValue.get(), Backend::STATIC); + } + /* No matter where is the kvcache, the scales and zero points are always in memory, since their size is very small */ + if (mConfig.mQuantKey) { + mDequantKeyScale.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP})); + mDequantKeyZeroPoint.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP})); + mBackend->onAcquireBuffer(mDequantKeyScale.get(), Backend::STATIC); + mBackend->onAcquireBuffer(mDequantKeyZeroPoint.get(), Backend::STATIC); + } +} + +void KVCacheManager::onRealloc(int kv_seq_len) { + if (kv_seq_len <= mMaxLength) { + return; + } + int oldMaxLength = mMaxLength; + mMaxLength = kv_seq_len + mConfig.mExpandChunk; + size_t oldKeySize = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); + size_t oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * oldMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); + size_t keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); + size_t valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); + /*==== No limit for kvcache ====*/ + if (mConfig.mKVCacheSizeLimit == -1) { + expandKVCacheInMem(oldMaxLength); + } + /*==== Last time the kvcache is memory, now it should be in memory too ====*/ + else if (keySize + valueSize <= mConfig.mKVCacheSizeLimit) { + expandKVCacheInMem(oldMaxLength); + } + /*==== Last time the kvcache is in memory, but now it should be moved to disk ====*/ + else if (oldKeySize + oldValueSize <= mConfig.mKVCacheSizeLimit) { + createKVCacheFile(); + resetKVCacheFileSize(keySize, valueSize); + mmapKVCache(keySize, valueSize); + moveKVCacheFromMemToDisk(oldMaxLength); + mKVCacheInDisk = true; + } + /*==== Last time the kvcache is disk, now it should be in disk too ====*/ + else { + expandKVCacheInDisk(oldMaxLength); + } + /* No matter where is the kvcache, the scales and zero points are always in memory, since their size is very small */ + if (mConfig.mQuantKey) { + auto new_scale = Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP}); + auto new_zeroPoint = Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP}); + mBackend->onAcquireBuffer(new_scale, Backend::STATIC); + mBackend->onAcquireBuffer(new_zeroPoint, Backend::STATIC); + for (int h = 0; h < mKvNumHead; h++) { + memcpy(new_scale->host() + h * UP_DIV(mMaxLength, hP) * hP * mBytes, mDequantKeyScale->host() + h * UP_DIV(oldMaxLength, hP) * hP * mBytes, UP_DIV(oldMaxLength, hP) * hP * mBytes); + memcpy(new_zeroPoint->host() + h * UP_DIV(mMaxLength, hP) * hP * mBytes, mDequantKeyZeroPoint->host() + h * UP_DIV(oldMaxLength, hP) * hP * mBytes, UP_DIV(oldMaxLength, hP) * hP * mBytes); + } + mDequantKeyScale.reset(new_scale); + mDequantKeyZeroPoint.reset(new_zeroPoint); + } +} + +void KVCacheManager::onClear() { + if (mKVCacheInDisk) { + size_t oldKeySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); + size_t oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); + unmapKVCache(oldKeySize, oldValueSize); + removeKVCacheFile(); + mKVCacheInDisk = false; + } + else { + mPastKey.reset(); + mPastValue.reset(); + } + mMaxLength = mPastLength = 0; +} + +template +static void pack_key(const Tensor* key, char* pack_key, int mPastLength, int seq_len, int mKvNumHead, int mHeadDim, + int hP, int kv_h, bool quantKey, char* scale, char* zero_point, const MNN::CoreFunctions * core) { + if (quantKey) { + int8_t * key_dst = reinterpret_cast(pack_key); + T * scale_dst = reinterpret_cast(scale); + T * zeroPoint_dst = reinterpret_cast(zero_point); + for (int i = 0; i < seq_len; i++) { + T * key_src = key->host() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim; + int out_index = (mPastLength + i) / hP; + int in_index = (mPastLength + i) % hP; + T minKey, maxKey; + core->MNNCountMaxMinValue((float*)key_src, (float*)&minKey, (float*)&maxKey, mHeadDim); + scale_dst[out_index * hP + in_index] = (maxKey - minKey) / 255.0f; + zeroPoint_dst[out_index * hP + in_index] = 128.0f * (maxKey - minKey) / 255.0f + minKey; + for (int j = 0; j < mHeadDim; j++) { + key_dst[out_index * mHeadDim * hP + j * hP + in_index] = roundf((key_src[j] - minKey) / (maxKey - minKey) * 255 - 128); + } + } + } + else { + T * key_dst = reinterpret_cast(pack_key); + for (int i = 0; i < seq_len; i++) { + T * key_src = key->host() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim; + int out_index = (mPastLength + i) / hP; + int in_index = (mPastLength + i) % hP; + for (int j = 0; j < mHeadDim; j++) { + key_dst[out_index * mHeadDim * hP + j * hP + in_index] = key_src[j]; + } + } + } +} + +template +static void pack_value(const Tensor* value, char* pack_value, int mMaxLength, int mPastLength, int seq_len, int mKvNumHead, int mHeadDim, int hP, int kv_h, bool quantValue, const MNN::CoreFunctions * core) { + if (quantValue) { + fp8_t * value_dst = reinterpret_cast(pack_value); + uint8_t * buf = (uint8_t *)MNNMemoryAllocAlign(mHeadDim, MNN_MEMORY_ALIGN_DEFAULT); + for (int i = 0; i < seq_len; i++) { + T * value_src = value->host() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim; + if (sizeof(T) == 2) { + core->MNNFp16ToFp8(buf, (uint16_t*)value_src, mHeadDim); + } else { + core->MNNFp32ToFp8(buf, (float*)value_src, mHeadDim); + } + for (int j = 0; j < mHeadDim; j++) { + int out_index = j / hP; + int in_index = j % hP; + value_dst[out_index * mMaxLength * hP + (mPastLength + i) * hP + in_index] = buf[j]; + } + } + MNNMemoryFreeAlign(buf); + } + else { + T * value_dst = reinterpret_cast(pack_value); + for (int i = 0; i < seq_len; i++) { + T * value_src = value->host() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim; + for (int j = 0; j < mHeadDim; j++) { + int out_index = j / hP; + int in_index = j % hP; + value_dst[out_index * mMaxLength * hP + (mPastLength + i) * hP + in_index] = value_src[j]; + } + } + } +} + +void KVCacheManager::onPushBack(const Tensor * key, const Tensor * value) { + auto core = static_cast(mBackend)->functions(); + int seq_len = key->shape()[1]; + int tileCount = UP_DIV(mKvNumHead, mThreadNum); + std::function packKV = [=](int tid) { + for (int kv_h = tid * tileCount; kv_h < (tid+1) * tileCount && kv_h < mKvNumHead; kv_h++) { + if (mBytes == 2) { + pack_key(key, addrOfKey(kv_h), mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantKey, addrOfScale(kv_h), addrOfZeroPoint(kv_h), core); + pack_value(value, addrOfValue(kv_h), mMaxLength, mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantValue, core); + } else { + pack_key(key, addrOfKey(kv_h), mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantKey, addrOfScale(kv_h), addrOfZeroPoint(kv_h), core); + pack_value(value, addrOfValue(kv_h), mMaxLength, mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantValue, core); + } + } + }; + MNN_CONCURRENCY_BEGIN(tid, mThreadNum) { + packKV((int)tid); + } + MNN_CONCURRENCY_END(); + mPastLength += seq_len; +} + +void KVCacheManager::onDequantValue(Tensor * dequantedValues) { + auto core = static_cast(mBackend)->functions(); + int tileCount = UP_DIV(mKvNumHead, mThreadNum); + std::function dequant = [=](int tid) { + for (int kv_h = tid * tileCount; kv_h < (tid+1) * tileCount && kv_h < mKvNumHead; kv_h++) { + char * dst = dequantedValues->host() + kv_h * UP_DIV(mHeadDim, hP) * mPastLength * hP * mBytes; + char * src = addrOfValue(kv_h); + for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { + if (mBytes == 2) { + core->MNNFp8ToFp16((uint16_t*)dst, (uint8_t*)src, mPastLength * hP); + } else { + core->MNNFp8ToFp32((float*)dst, (uint8_t*)src, mPastLength * hP); + } + dst += mPastLength * hP * mBytes; + src += mMaxLength * hP; + } + } + }; + MNN_CONCURRENCY_BEGIN(tid, mThreadNum) { + dequant((int)tid); + } + MNN_CONCURRENCY_END(); +} + +} // namespace MNN + +#endif // MNN_SUPPORT_TRANSFORMER_FUSE \ No newline at end of file diff --git a/source/backend/cpu/KVCacheManager.hpp b/source/backend/cpu/KVCacheManager.hpp new file mode 100644 index 000000000..582481990 --- /dev/null +++ b/source/backend/cpu/KVCacheManager.hpp @@ -0,0 +1,129 @@ +// +// KVCacheManager.hpp +// MNN +// +// Created by MNN on 2024/08/05. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef MNN_SUPPORT_TRANSFORMER_FUSE + +#ifndef KVCACHE_MANAGER_HPP +#define KVCACHE_MANAGER_HPP + +#include "core/Macro.h" +#include "core/MNNFileUtils.h" +#include "backend/cpu/CPUBackend.hpp" +#include "backend/cpu/compute/CommonOptFunction.h" + +#if defined (__aarch64__) +#define FLOAT16_T __fp16 +#else +#define FLOAT16_T float +#endif + +typedef uint8_t fp8_t; + +namespace MNN { + +class KVCacheManager : public NonCopyable{ +public: + struct KVCacheConfig { + bool mQuantKey = false; // Quantize keys to int8 or not + bool mQuantValue = false; // Quantize values to fp8 or not + std::string mKVCacheDir = "/tmp"; // Path of the kvcache files in disk + size_t mKVCacheSizeLimit = -1; // The limit of the kvcache size + int mExpandChunk = 64; // Number of expand chunks when the buffer is full + }; +private: + Backend * mBackend; + KVCacheConfig mConfig; + std::shared_ptr mPastKey; // numhead, [maxlen/eP, headdim, eP] + std::shared_ptr mPastValue; // numhead, [headdim/eP, maxlen, eP] + std::shared_ptr mDequantKeyScale; // numhead, [maxlen/eP, 1, eP] + std::shared_ptr mDequantKeyZeroPoint; // numhead, [maxlen/eP, 1, eP] + file_t mKeyCacheFD = INVALID_FILE; // The file descriptor of keys + file_t mValueCacheFD = INVALID_FILE; // The file descriptor of values + char * mMapKeyAddr = nullptr; // Memory-mapped address of keys + char * mMapValueAddr = nullptr; // Memory-mapped address of values + bool mKVCacheInDisk = false; // Whether the kvcache is in disk or in memory now + int mPastLength = 0; // Length of past kvcache + int mMaxLength = 0; // Capacity of current kvcache buffer (how many kv items can be stored at most) + int eP, lP, hP, mBytes, mThreadNum; + int mKvNumHead = 0, mHeadDim = 0; + void createKVCacheFile(); + void removeKVCacheFile(); + void resetKVCacheFileSize(size_t keySize, size_t valueSize); + void mmapKVCache(size_t keySize, size_t valueSize); + void unmapKVCache(size_t keySize, size_t valueSize); + void expandKVCacheInMem(int oldMaxLength); + void moveKVCacheFromMemToDisk(int oldMaxLength); + void expandKVCacheInDisk(int oldMaxLength); +public: + KVCacheManager(Backend * backend, KVCacheConfig & kvConfig) { + mBackend = backend; + mConfig = kvConfig; + } + ~KVCacheManager() { + onClear(); + } + const Backend * backend() { + return mBackend; + } + const KVCacheConfig * config() { + return &mConfig; + } + const Tensor * key() { + return mPastKey.get(); + } + const Tensor * value() { + return mPastValue.get(); + } + const Tensor * scale() { + return mDequantKeyScale.get(); + } + const Tensor * zeroPoint() { + return mDequantKeyZeroPoint.get(); + } + bool inDisk() { + return mKVCacheInDisk; + } + int kvLength() { + return mPastLength; + } + int maxLength() { + return mMaxLength; + } + char * addrOfKey(int kv_h) { + char * baseAddr = mKVCacheInDisk ? mMapKeyAddr : mPastKey->host(); + return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); + } + char * addrOfValue(int kv_h) { + char * baseAddr = mKVCacheInDisk ? mMapValueAddr : mPastValue->host(); + return baseAddr + kv_h * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); + } + char * addrOfScale(int kv_h) { + if (mConfig.mQuantKey == false) + return nullptr; + char * baseAddr = mDequantKeyScale->host(); + return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * 1 * hP * mBytes; + } + char * addrOfZeroPoint(int kv_h) { + if (mConfig.mQuantKey == false) + return nullptr; + char * baseAddr = mDequantKeyZeroPoint->host(); + return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * 1 * hP * mBytes; + } + void onResize(int kv_num_head, int head_dim); + void onAlloc(int kv_seq_len); + void onRealloc(int kv_seq_len); + void onClear(); + void onPushBack(const Tensor * key, const Tensor * value); + void onDequantValue(Tensor * dequantedValues); +}; + +} // namespace MNN + +#endif // KVCACHE_MANAGER_HPP + +#endif // MNN_SUPPORT_TRANSFORMER_FUSE \ No newline at end of file diff --git a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S index 6368937de..f7988025b 100644 --- a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S +++ b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S @@ -67,7 +67,7 @@ ldr r12, [r6, #36] // f32minmax str r12, [sp, #12] ldr r12, [r6, #40] // blockNum mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum -lsl r12, r12, #6 // weight_stride = src_depth_quad*LP*HP +lsl r12, r12, #5 // weight_stride = src_depth_quad*LP*HP str r12, [sp, #16] ldr r12, [r6, #48] // extraScale str r12, [sp, #20] @@ -198,8 +198,8 @@ L2LoopDz: // vaddq.s32 q0, q8, q4 // add bias // vaddq.s32 q1, q9, q4 - vcvt.f32.s32 q0, q0 - vcvt.f32.s32 q1, q1 + vcvt.f32.s32 q0, q8 + vcvt.f32.s32 q1, q9 vmulq.f32 q0, q0, q5 // mul scale vmulq.f32 q1, q1, q5 @@ -224,6 +224,19 @@ L2LoopDz: vmla.f32 q0, q7, d12[0] vmla.f32 q1, q7, d12[1] + L2_ADD_BIAS: + cmp lr, #0 + beq L2_ADD_DSTV + vld1.f32 {q4}, [lr]! // bias + vadd.f32 q0, q0, q4 // bias + vadd.f32 q1, q1, q4 + b L2_POST + + L2_ADD_DSTV: + vld1.f32 {q4, q5}, [r0] + vadd.f32 q0, q0, q4 + vadd.f32 q1, q1, q5 + L2_POST: ldr r6, [sp, #12] // fp32 minmax cmp r6, #0 @@ -334,7 +347,7 @@ L1LoopDz: vpadd.s32 d17, d20, d22 // vaddq.s32 q0, q8, q4 - vcvt.f32.s32 q0, q0 + vcvt.f32.s32 q0, q8 vmulq.f32 q0, q0, q5 // extra scale if has ldr r6, [sp, #20] diff --git a/source/backend/cpu/arm/arm64/MNNAvgPoolInt8.S b/source/backend/cpu/arm/arm64/MNNAvgPoolInt8.S index 7c77c0fc2..9798b29bf 100644 --- a/source/backend/cpu/arm/arm64/MNNAvgPoolInt8.S +++ b/source/backend/cpu/arm/arm64/MNNAvgPoolInt8.S @@ -22,10 +22,10 @@ asm_function MNNAvgPoolInt8 ldr x8, [sp, #0] dup v24.4s, w8 -sub sp, sp, #32 -str x19, [sp, #0] -str x20, [sp, #8] -str x21, [sp, #16] +stp d14, d15, [sp, #(-16 * 4)]! +stp d12, d13, [sp, #16] +stp d10, d11, [sp, #32] +stp d8, d9, [sp, #48] cmp x4, #0 ble END @@ -206,10 +206,10 @@ cmp x2, #0 beq END END: -ldr x19, [sp, #0] -ldr x20, [sp, #8] -ldr x21, [sp, #16] -add sp, sp, #32 +ldp d8, d9, [sp, #48] +ldp d10, d11, [sp, #32] +ldp d12, d13, [sp, #16] +ldp d14, d15, [sp], #(16 * 4) ret diff --git a/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S b/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S index 0e31ad489..0027d0b75 100644 --- a/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S +++ b/source/backend/cpu/arm/arm64/MNNBilinearLineC8.S @@ -28,8 +28,11 @@ dup v31.4s, v31.s[0] // v31: df fmov s30, #1.0 // v30: sf=1-df fsub s30, s30, s31 movi v1.4s, #128 // s1=128 +scvtf v1.4s, v1.4s fmul s31, s31, s1 fmul s30, s30, s1 +fcvtas v30.4s, v30.4s +fcvtas v31.4s, v31.4s dup v31.8h, v31.h[0] dup v30.8h, v30.h[0] diff --git a/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S b/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S index 85cf65bbf..70227c97e 100644 --- a/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S +++ b/source/backend/cpu/arm/arm64/MNNBilinearSampleC8.S @@ -27,6 +27,7 @@ stp x19, x20, [sp, #(16 * 6)] mov w15, #8 // w15: pack uxtw x15, w15 movi v14.4s, #128 +scvtf v14.4s, v14.4s cmp x5, #0 beq END @@ -44,6 +45,9 @@ fsub v23.4s, v23.4s, v22.4s // v23: 1-factor fmul v23.4s, v23.4s, v14.s[0] fmul v22.4s, v22.4s, v14.s[0] +fcvtas v22.4s, v22.4s +fcvtas v23.4s, v23.4s + dup v30.8b, v23.b[0] // v30: sf0 dup v31.8b, v22.b[0] // v31: df0 dup v28.8b, v23.b[4] // v28: sf1 @@ -183,7 +187,7 @@ beq END L1Loop: ld1 {v31.s}[0], [x3], #4 -dup v31.4s, v31.s[0] +dup v31.4s, v31.s[0] fmov s30, #1.0 fsub s30, s30, s31 fmul s30, s30, s14 // (float)t -> (int16)t diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S index eda852364..0225e0b4e 100644 --- a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S +++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S @@ -870,6 +870,7 @@ Tile8End: add x0, x0, x21, LSL #3 add x1, x1, #64 add x27, x27, #32 + cbnz w23, TILE_4 add x4, x4, #64 // Revert x4 for following tile. TILE_4: diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp index 897f10b40..d806e0cb9 100644 --- a/source/backend/cpu/compute/CommonOptFunction.cpp +++ b/source/backend/cpu/compute/CommonOptFunction.cpp @@ -3238,12 +3238,94 @@ static void _MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFunctions } } +// fp32 <--> fp8 +static const int FP32_EXP_BIAS = 127; +static const int FP8_EXP_BIAS = 24; // [0, 31] --> [-24, 7] --> [1 / 2^24, 2^7] +void MNNFp32ToFp8(uint8_t* dst, const float* src, size_t size) { + for (int i = 0; i < size; i++) { + uint32_t rawData = *((uint32_t *)(&src[i])); + uint32_t sign = (rawData >> 31) & 1U; + uint32_t exp = (int)((rawData >> 23) & 0x0ffU); + uint32_t mant = (rawData >> 21) & 3U; + int realExp = (int)exp - FP32_EXP_BIAS; + realExp = ALIMAX(realExp, 0 - FP8_EXP_BIAS); + realExp = ALIMIN(realExp, 31 - FP8_EXP_BIAS); + exp = (uint32_t)(realExp + FP8_EXP_BIAS); + dst[i] = (int8_t)((sign << 7) | (exp << 2) | mant); + } +} +void MNNFp8ToFp32(float* dst, const uint8_t* src, size_t size) { + for (int i = 0; i < size; i++) { + uint32_t sign = (src[i] >> 7) & 1U; + uint32_t exp = (int)((src[i] >> 2) & 0x1fU); + uint32_t mant = (src[i] & 3U) << 21; + int realExp = (int)exp - FP8_EXP_BIAS; + exp = (uint32_t)(realExp + FP32_EXP_BIAS); + uint32_t rawData = (sign << 31) | (exp << 23) | mant; + dst[i] = *((float *)(&rawData)); + } +} +// fp16 <--> fp8 +void MNNFp16ToFp8(uint8_t* dst, const uint16_t* src, size_t size) { +#ifdef MNN_USE_NEON +#ifdef __aarch64__ + int loopN = size / 16; + for (int i = 0; i < loopN; i++) { + uint8x16_t v1 = vld1q_u8((uint8_t*)(src + i * 16)); + uint8x16_t v2 = vld1q_u8((uint8_t*)(src + i * 16 + 8)); + uint8x16_t res = vuzp2q_u8(v1, v2); + vst1q_u8(dst + i * 16, res); + } + for (int i = loopN * 16; i < size; i++) { + dst[i] = static_cast(src[i] >> 8); + } +#else + int loopN = size / 8; + for (int i = 0; i < loopN; i++) { + uint16x8_t vec = vld1q_u16(src + i * 8); + uint8x8_t res = vshrn_n_u16(vec, 8); + vst1_u8(dst + i * 8, res); + } + for (int i = loopN * 8; i < size; i++) { + dst[i] = static_cast(src[i] >> 8); + } +#endif // ARM64 +#else + for (int i = 0; i < size; i++) { + dst[i] = static_cast(src[i] >> 8); + } +#endif // USE_NEON +} +void MNNFp8ToFp16(uint16_t* dst, const uint8_t* src, size_t size) { +#ifdef MNN_USE_NEON + int loopN = size / 8; + for (int i = 0; i < loopN; i++) { + uint8x8_t vec8x8 = vld1_u8(src + i * 8); + uint16x8_t vec16x8 = vshll_n_u8(vec8x8, 8); + vst1q_u16(dst + i * 8, vec16x8); + } + for (int i = loopN * 8; i < size; i++) { + dst[i] = static_cast(src[i]) << 8; + } +#else + for (int i = 0; i < size; i++) { + dst[i] = static_cast(src[i]) << 8; + } +#endif // USE_NEON +} + namespace MNN { static CoreFunctions* gCoreFunction = nullptr; void MNNCoreFunctionInit() { gCoreFunction = new CoreFunctions; + // fp8 + gCoreFunction->MNNFp32ToFp8 = MNNFp32ToFp8; + gCoreFunction->MNNFp16ToFp8 = MNNFp16ToFp8; + gCoreFunction->MNNFp8ToFp32 = MNNFp8ToFp32; + gCoreFunction->MNNFp8ToFp16 = MNNFp8ToFp16; + // MatMul gCoreFunction->MNNGetMatMulPackMode = MNNGetMatMulPackMode; gCoreFunction->MNNPackC4ForMatMul_A = MNNPackC4ForMatMul_A; @@ -3426,4 +3508,4 @@ void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth areaOffset, }; MNNPackInt8C2(dst, src, area, depth, offset); -} +} \ No newline at end of file diff --git a/source/backend/cpu/compute/CommonOptFunction.h b/source/backend/cpu/compute/CommonOptFunction.h index bbfdce0fa..4af1a81a8 100644 --- a/source/backend/cpu/compute/CommonOptFunction.h +++ b/source/backend/cpu/compute/CommonOptFunction.h @@ -20,6 +20,11 @@ extern "C" { +void MNNFp32ToFp8(uint8_t* dst, const float* src, size_t size); +void MNNFp8ToFp32(float* dst, const uint8_t* src, size_t size); +void MNNFp16ToFp8(uint8_t* dst, const uint16_t* src, size_t size); +void MNNFp8ToFp16(uint16_t* dst, const uint8_t* src, size_t size); + void MNNReluWithSlope(float* dst, const float* src, size_t sizeQuad, float slope); void MNNReluInt8(int8_t* dst, const int8_t* src, size_t size, ssize_t zeroPoint); @@ -190,6 +195,12 @@ constexpr int InputTileMax = 14; // same value from DynamicGemm.h, cannot includ namespace MNN { struct CoreFunctions { + // fp8 + void (*MNNFp32ToFp8)(uint8_t* dst, const float* src, size_t size); + void (*MNNFp16ToFp8)(uint8_t* dst, const uint16_t* src, size_t size); + void (*MNNFp8ToFp32)(float* dst, const uint8_t* src, size_t size); + void (*MNNFp8ToFp16)(uint16_t* dst, const uint8_t* src, size_t size); + // cpu feature bool supportFp16arith = false; bool supportSDot = false; diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp index 6471acb3a..25fb13a8f 100644 --- a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp +++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp @@ -18,8 +18,11 @@ namespace MNN { -ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr res): CPUConvolution(convOp->common(), backend), mResourceInt8(res), mMutableResource(res, backend) { - mValid = mMutableResource.mValid; +ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Op* op): CPUConvolution(op->main_as_Convolution2D()->common(), backend) {} + +ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr res): CPUConvolution(op->main_as_Convolution2D()->common(), backend), mResourceInt8(res) { + mMutableResource.reset(new MutableResourceInt8(res, backend)); + mValid = mMutableResource->mValid; } ConvInt8TiledExecutor::~ConvInt8TiledExecutor() { @@ -31,7 +34,7 @@ bool ConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst) } ErrorCode ConvInt8TiledExecutor::onResize(const std::vector& inputs, const std::vector& outputs) { - mMutableResource.updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0])); + mMutableResource->updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0])); CPUConvolution::onResize(inputs, outputs); ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, static_cast(backend())->functions(), static_cast(backend())->int8Functions()); return NO_ERROR; @@ -99,7 +102,7 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common, pack = 4; } if (SRC_UNIT > pack) { - MNN_ASSERT(SRC_UNIT % UNIT == 0); + MNN_ASSERT(SRC_UNIT % pack == 0); shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack), UNIT, SRC_UNIT}; } else { shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}; @@ -116,13 +119,13 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common, return true; } -static void Getfp32Info (std::shared_ptr resource, std::shared_ptr weightOrigin, const Convolution2D* conv2d, std::shared_ptr quantCommon) { +static void GetResourceInt8(std::shared_ptr resource, std::shared_ptr quantCommon, const Convolution2D* conv2d, Backend* backend) { // common parameters int outputCount = conv2d->common()->outputCount(); - auto core = static_cast(resource->backend)->functions(); + auto core = static_cast(backend)->functions(); int LSize = conv2d->common()->inputCount() * conv2d->common()->kernelX() * conv2d->common()->kernelY(); int ocUp4 = ROUND_UP(outputCount, core->pack); - + int dequantCnt = quantCommon->alpha.size(); if (quantCommon->asymmetric) { dequantCnt /= 2; @@ -131,19 +134,33 @@ static void Getfp32Info (std::shared_ptr resource, std int scaleSize = blockNum * ocUp4; // pack size. int blockSize = LSize / blockNum; int originOffset = 0; + resource->mActBits = 8; if (quantCommon->canUseInt4) { originOffset = -8; + resource->mActBits = 4; } - - // Save weight quant scale and bias: wf=scale*wi+bias + // Save bias + resource->mOriginBias.reset(Tensor::createDevice({ocUp4})); // float + auto success = backend->onAcquireBuffer(resource->mOriginBias.get(), Backend::STATIC); + if (!success) { + MNN_ERROR("Alloc bias memory error\n"); + return; + } + ::memset(resource->mOriginBias->host(), 0, ocUp4 * sizeof(float)); + if (conv2d->bias()) { + ::memcpy(resource->mOriginBias->host(), conv2d->bias()->data(), outputCount * sizeof(float)); + } else { + ::memset(resource->mOriginBias->host(), 0, ocUp4 * sizeof(float)); + } + // Save weight quant alpha and zero: wf=alpha*wi+zero int bytes = 4; - resource->mDequantize.mScaleBias.reset(Tensor::createDevice({2 * scaleSize * bytes})); - auto success = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC); + resource->mOriginScale.reset(Tensor::createDevice({2 * scaleSize * bytes})); + success = backend->onAcquireBuffer(resource->mOriginScale.get(), Backend::STATIC); if (!success) { - MNN_ERROR("Alloc denquant scaleBias memory error\n"); + MNN_ERROR("Alloc denquant alpha, zero memory error\n"); return; } - auto alphaPtr = resource->mDequantize.mScaleBias->host(); + auto alphaPtr = resource->mOriginScale->host(); auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + scaleSize * bytes); ::memset(alphaPtr, 1, scaleSize * bytes); ::memset(biasPtr, 0, scaleSize * bytes); @@ -159,7 +176,6 @@ static void Getfp32Info (std::shared_ptr resource, std dstBias[j] = quanInfoPtr[2 * scaleIndex] + (float)originOffset * dstAlpha[j]; } } - } else { for (int i = 0; i < blockNum; ++i) { auto dstAlpha = alphaPtr + i * ocUp4; @@ -173,13 +189,13 @@ static void Getfp32Info (std::shared_ptr resource, std } // Save float weight kernel sum resource->mWeightKernelSum.reset(Tensor::createDevice({bytes * ocUp4})); - success = resource->backend->onAcquireBuffer(resource->mWeightKernelSum.get(), Backend::STATIC); + success = backend->onAcquireBuffer(resource->mWeightKernelSum.get(), Backend::STATIC); if (!success) { - MNN_ERROR("Alloc denquant mWeightKernelSum memory error\n"); + MNN_ERROR("Alloc denquant weight kernel sum memory error\n"); return; } auto weightKernelSum = resource->mWeightKernelSum->host(); - auto realWeightData = weightOrigin->host(); + auto realWeightData = quantCommon->weight.get(); ::memset(weightKernelSum, 0, resource->mWeightKernelSum->size()); for (int j = 0; j < outputCount; ++j) { float sum = 0.f; @@ -195,116 +211,193 @@ static void Getfp32Info (std::shared_ptr resource, std bias = 0; } int tmp = 0; - for (int i = 0; i < blockSize; ++i) { - int l_index = k * blockSize + i; - tmp += (int)realWeightData[j * blockNum * blockSize + l_index]; + if (quantCommon->canUseInt4) { + for (int i = 0; i < blockSize; ++i) { + int l_index = k * blockSize + i; + int w_idx = (j * blockNum * blockSize + l_index); + int w_offset = w_idx / 2; + int w_mask = w_idx % 2; + uint8_t s = realWeightData[w_offset]; + int val = w_idx % 2 ? s & 0x0f : s >> 4; + tmp += (val - 8); + } + } else { + for (int i = 0; i < blockSize; ++i) { + int l_index = k * blockSize + i; + tmp += (int)realWeightData[j * blockNum * blockSize + l_index]; + } } + sum += (tmp * scale + blockSize * bias); } weightKernelSum[j] = sum; } } -DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr res, bool dynamicQuantExe) : ConvInt8TiledExecutor(backend, convOp, res) { - std::shared_ptr weightOrigin = mResourceInt8->mWeightInt8; - std::shared_ptr quanCommon ; - mDynamicQuantExe = dynamicQuantExe; - if (dynamicQuantExe) { - MNN_ASSERT(convOp->quanParameter() != nullptr && convOp->quanParameter()->buffer() != nullptr); - quanCommon = ConvolutionCommon::load(convOp, backend, false, true); - // fp32 weightKernelSum - mResource.reset(new CPUConvolution::Resource); - mResource->backend = backend; - Getfp32Info(mResource, weightOrigin, convOp, quanCommon); // Call this before reorder weight. - } - - mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8); - if(!mValid) { - return; - } +DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr quanCommon) : mDynamicQuantExe(true), ConvInt8TiledExecutor(backend, op) { + auto convOp = op->main_as_Convolution2D(); auto core = static_cast(backend)->int8Functions(); auto gcore = static_cast(backend)->functions(); - // offline quant - if (false == dynamicQuantExe) { - mGemmKernel = core->Int8GemmKernel; -#ifdef MNN_USE_SSE - int actBits = convOp->symmetricQuan()->nbits(); - if (actBits <= 7) { - mGemmKernel = core->Int8GemmKernelFast; - } -#else - if(convOp->symmetricQuan()->method() == QuantizeAlgo_OVERFLOW_AWARE){ - mGemmKernel = core->Int8GemmKernelFast; - } -#endif - mResource.reset(new CPUConvolution::Resource); - CPUConvolution::makeResource(backend, mResource, convOp, mResourceInt8); - return; - } - + mResourceInt8.reset(new CPUConvolution::ResourceInt8); + GetResourceInt8(mResourceInt8, quanCommon, convOp, backend); + mMutableResource.reset(new MutableResourceInt8(mResourceInt8, backend)); // dynamic quant int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - bool needPermuteInt4weight = ((UNIT == 8 && SRC_UNIT == 8 && DST_XUNIT ==10) || (UNIT == 4 && SRC_UNIT == 8 && DST_XUNIT ==20) || (UNIT == 64 && SRC_UNIT == 4 && DST_XUNIT ==4)); - mResource->mDequantize.bits = 8; - if (quanCommon->canUseInt4) { + int pack = gcore->pack; + bool needPermuteInt4weight = ((UNIT == 8 && SRC_UNIT == 8 && DST_XUNIT ==10) || (UNIT == 64 && SRC_UNIT == 4 && DST_XUNIT ==4)); + auto weightLength = quanCommon->weight.size(); + int kernelCount = mCommon->kernelX() * mCommon->kernelY(); + int oc = convOp->common()->outputCount(); + int ic = convOp->common()->inputCount(); + bool directReadInt4weight = (kernelCount == 1 && ROUND_UP(oc, UNIT) == oc && ROUND_UP(ic, SRC_UNIT) == ic); + if (quanCommon->canUseInt4 && directReadInt4weight) { + // int4 weight reorder mResourceInt8->mWeightAsymmetricQuant = true; - auto weightLength = mResourceInt8->mWeightInt8->size(); - MNN_ASSERT(weightLength % 2 == 0); - mResource->mDequantize.bits = 4; - std::shared_ptr weightLow(Tensor::createDevice( mResourceInt8->mWeightInt8->shape())); - auto res = mResource->backend->onAcquireBuffer(weightLow.get(), Backend::STATIC); + // shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}; + int hU = UP_DIV(oc, UNIT); + int lU = UP_DIV(ic, SRC_UNIT); + int hP = UNIT; + int lP = SRC_UNIT; + + // weight shape. + std::vector shape; + if (SRC_UNIT > pack) { + MNN_ASSERT(SRC_UNIT % pack == 0); + shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack), UNIT, SRC_UNIT}; + } else { + shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}; + } + mResourceInt8->mWeightInt8.reset(Tensor::createDevice(shape)); + + auto res = backend->onAcquireBuffer(mResourceInt8->mWeightInt8.get(), Backend::STATIC); if (!res) { MNN_ERROR("int4 weight acquire buffer error\n"); return ; } - auto srcPtr = mResourceInt8->mWeightInt8->host(); - auto dstPtr = weightLow->host(); + auto srcPtr = (uint8_t*)quanCommon->weight.get(); + auto dstPtr = mResourceInt8->mWeightInt8->host(); + ::memset(dstPtr, 0, mResourceInt8->mWeightInt8->size()); + // Pack two int4-weight to one int8-weight. if (false == needPermuteInt4weight) { - weightLength = UP_DIV(weightLength, 2); - for (int i=0; i < weightLength; ++i) { - int s0 = srcPtr[2 * i + 0]; - int s1 = srcPtr[2 * i + 1]; - int d = (s0 + 8) * 16 + (s1 + 8); - dstPtr[i] = d; + for (int i = 0; i < hU; i++) { + for (int j = 0; j < lU; j++) { + for (int k = 0; k < hP; k++) { + for (int id = 0; id < lP / 2; ++id) { + dstPtr[(i * lU * lP * hP + j * hP * lP + k * lP) / 2 + id] = srcPtr[((i * hP + k) * lP * lU + (j * lP)) / 2 + id]; + } + } + } } } else { - int permuteUnit = UNIT * SRC_UNIT; - int halfPermuteStride = static_cast(permuteUnit / 2); - for (int i = 0; i < weightLength / permuteUnit; ++i) { - auto src0 = srcPtr + i * permuteUnit; - auto dst0 = dstPtr + i * halfPermuteStride; - for (int j = 0; j < halfPermuteStride; ++j) { - int s0 = src0[j]; - int s1 = src0[j + halfPermuteStride]; + for (int i = 0; i < hU; i++) { + for (int j = 0; j < lU; j++) { + auto dst_ptr = dstPtr + (i * lU * lP * hP + j * hP * lP) / 2; + for (int k = 0; k < 16; k++) { + int col = k % 4; + int row = k / 4; + uint8_t s0 = srcPtr[((i * hP + row + 0) * lP * lU + j * lP) / 2 + col]; + uint8_t s1 = srcPtr[((i * hP + row + 4) * lP * lU + j * lP) / 2 + col]; + uint8_t d0 = (s0 & 0xf0) | (s1 >> 4); + uint8_t d1 = (s0 << 4) | (s1 & 0x0f); + dst_ptr[k * 2 + 0] = d0; + dst_ptr[k * 2 + 1] = d1; + } + } + } + } + } else { + // std::shared_ptr srcWeight; + + if (quanCommon->canUseInt4) { + mResourceInt8->mWeightAsymmetricQuant = true; + auto srcPtr = reinterpret_cast(quanCommon->weight.get()); + std::vector tmpWeight(weightLength * 2, 0); + for (int i = 0; i < weightLength; ++i) { + int8_t s0 = (srcPtr[i] >> 4) - 8; // For int4 quant weight, +8 saved in quant buffer + int8_t s1 = (srcPtr[i] & 0x0f) - 8; + tmpWeight[2 * i + 0] = s0; + tmpWeight[2 * i + 1] = s1; + } + std::shared_ptr srcWeight(Tensor::create({weightLength * 2}, (void*)tmpWeight.data())); + mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8); + if(!mValid) { + return; + } + MNN_ASSERT(mResourceInt8->mWeightInt8->size() % 2 == 0); + int leng = mResourceInt8->mWeightInt8->size(); + int halflen = leng / 2; + std::shared_ptr weightLow(Tensor::create({halflen})); + auto dstint4Ptr = weightLow->host(); + auto srcint4Ptr = mResourceInt8->mWeightInt8->host(); + if (false == needPermuteInt4weight) { + for (int i=0; i < halflen; ++i) { + int s0 = srcint4Ptr[2 * i + 0]; + int s1 = srcint4Ptr[2 * i + 1]; int d = (s0 + 8) * 16 + (s1 + 8); - dst0[j] = d; + dstint4Ptr[i] = d; } + } else { + int permuteUnit = UNIT * SRC_UNIT; + int halfPermuteStride = static_cast(permuteUnit / 2); + for (int i = 0; i < leng / permuteUnit; ++i) { + auto src0 = srcint4Ptr + i * permuteUnit; + auto dst0 = dstint4Ptr + i * halfPermuteStride; + for (int j = 0; j < halfPermuteStride; ++j) { + int s0 = src0[j]; + int s1 = src0[j + halfPermuteStride]; + int d = (s0 + 8) * 16 + (s1 + 8); + dst0[j] = d; + } + } + } + // Update int4 weight to mWeightInt8. + mResourceInt8->mWeightInt8 = weightLow; + } else { + std::shared_ptr srcWeight(Tensor::create({weightLength}, (void*)quanCommon->weight.get())); + mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8); + if(!mValid) { + return; } } - // Update int4 weight to mWeightInt8. - mResourceInt8->mWeightInt8 = weightLow; } + // Relu/Relu6 post parameters auto postPtr = getPostParameters(); - mResource->mReluThreshold.resize(2); - mResource->mReluThreshold[0] = postPtr[2]; - mResource->mReluThreshold[1] = postPtr[3]; + mResourceInt8->mReluThreshold.resize(2); + mResourceInt8->mReluThreshold[0] = postPtr[2]; + mResourceInt8->mReluThreshold[1] = postPtr[3]; if (gcore->bytes == 2) { - gcore->MNNFp32ToLowp(mResource->mReluThreshold.data(), reinterpret_cast(mResource->mReluThreshold.data()), 2); + gcore->MNNFp32ToLowp(mResourceInt8->mReluThreshold.data(), reinterpret_cast(mResourceInt8->mReluThreshold.data()), 2); } - if (mCommon->relu()) { - mResource->mReluThreshold[0] = 0.f; +} + +DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr res) : mDynamicQuantExe(false), ConvInt8TiledExecutor(backend, op, res) { + std::shared_ptr weightOrigin = mResourceInt8->mWeightInt8; + auto convOp = op->main_as_Convolution2D(); + mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8); + if(!mValid) { + return; } - if (mCommon->relu6()) { - mResource->mReluThreshold[0] = 0.f; - mResource->mReluThreshold[1] = 6.f; + // offline quant: choose int8 gemm kernel + auto core = static_cast(backend)->int8Functions(); + mGemmKernel = core->Int8GemmKernel; +#ifdef MNN_USE_SSE + int actBits = convOp->symmetricQuan()->nbits(); + if (actBits <= 7) { + mGemmKernel = core->Int8GemmKernelFast; } +#else + if(convOp->symmetricQuan()->method() == QuantizeAlgo_OVERFLOW_AWARE){ + mGemmKernel = core->Int8GemmKernelFast; + } +#endif + CPUConvolution::makeResourceNew(backend, convOp, mResourceInt8); } -DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, bool dynamicQuantExe, const DenseConvInt8TiledExecutor& exe) - : ConvInt8TiledExecutor(backend, convOp, exe.mResourceInt8), mGemmKernel(exe.mGemmKernel), mResource(exe.mResource), mDynamicQuantExe(dynamicQuantExe) { +DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, const DenseConvInt8TiledExecutor& exe) + : ConvInt8TiledExecutor(backend, op, exe.mResourceInt8), mGemmKernel(exe.mGemmKernel), mDynamicQuantExe(exe.mDynamicQuantExe) { } DenseConvInt8TiledExecutor::~DenseConvInt8TiledExecutor() { @@ -315,7 +408,7 @@ bool DenseConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution** if (nullptr == dst) { return true; } - auto exe = new DenseConvInt8TiledExecutor(bn, op->main_as_Convolution2D(), mDynamicQuantExe, *this); + auto exe = new DenseConvInt8TiledExecutor(bn, op, *this); if (!exe->valid()) { return false; } @@ -342,7 +435,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); if (mDynamicQuantExe == false) { - mMutableResource.updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0])); + mMutableResource->updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0])); CPUConvolution::onResize(inputs, outputs); ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, gcore, core); mBlockNum = 1; @@ -350,18 +443,18 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input CPUConvolution::onResize(inputs, outputs); // Gemm Kernel mGemmKernel = core->Int8GemmKernel; - if (mResource->mDequantize.bits == 4) { + if (mResourceInt8->mActBits == 4) { mGemmKernel = core->Int8GemmKernel_W4; } mQuantFunc = core->MNNFloat2Int8; if (gcore->bytes == 2 && gcore->pack == 8) { mGemmKernel = core->MNNGemmInt8AddBiasScale_Unit_FP16; - if (mResource->mDequantize.bits == 4) { + if (mResourceInt8->mActBits == 4) { mGemmKernel = core->MNNGemmInt8AddBiasScale_w4_Unit_FP16; } mQuantFunc = core->DynamicQuanInput_ARM82; mQuantAndReorderFunc = core->DynamicQuanInputAndReorder_ARM82; - + } // A axisSum kernel mSumByAxisLFunc = gcore->MNNSumByAxisLForMatmul_A; @@ -371,10 +464,10 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, gcore, core); } int ocUp4 = ROUND_UP(outputs[0]->channel(), gcore->pack); - int alphaSize = mResource->mDequantize.mScaleBias->size() / (4 * 2); + int alphaSize = mResourceInt8->mOriginScale->size() / (sizeof(float) * 2); mBlockNum = alphaSize / ocUp4; } - + // input scale buffer int batch = inputs[0]->batch(); // mTempIm2ColBuffer.reset(Tensor::createDevice({mThreadNums, DST_XUNIT * mIm2ColCount * mResourceInt8->mWeightInt8->length(1) * SRC_UNIT})); @@ -398,23 +491,30 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input tileLimit = ALIMIN(tileLimitByC, planeSize); auto ocPerThread = UP_DIV(outC4, threads); auto threadNeed = UP_DIV(outC4, ocPerThread); + int totalWork = outC4; + int part = 1; if (UNIT > gcore->pack) { // AVX512:UNIT=64,pack=16 MNN_ASSERT(UNIT % gcore->pack == 0); int ocDivUnit = UP_DIV(outC4 * gcore->pack, UNIT); ocPerThread = UP_DIV(ocDivUnit, threads); threadNeed = UP_DIV(ocDivUnit, ocPerThread); + totalWork = ocDivUnit; + part = UNIT / gcore->pack; } mThreadNums = ALIMIN(threads, threadNeed); mSplitByOc = true; mDivides.resize(threads+1); mDivides[0] = 0; - static_cast(backend()->getRuntime())->computeDivideSizes(outC4, mDivides.data() + 1); + static_cast(backend()->getRuntime())->computeDivideSizes(totalWork, mDivides.data() + 1); + for (int i = 0; i < mDivides.size(); ++i) { + mDivides[i] *= part; + } } mIm2ColCount = UP_DIV(tileLimit, DST_XUNIT); auto DynamicDestUnit = DST_XUNIT * mIm2ColCount; mTileCount = UP_DIV(planeSize, DynamicDestUnit); - + if (threads < planeSize) { mThreadNums = ALIMIN(threads, mTileCount); mDivides.resize(threads+1); @@ -422,14 +522,16 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input static_cast(backend()->getRuntime())->computeDivideSizes(mTileCount, mDivides.data() + 1); } int ocUp4 = ROUND_UP(outC, gcore->pack); - int alphaSize = mResource->mDequantize.mScaleBias->size() / (4 * 2); + // int alphaSize = mResource->mDequantize.mScaleBias->size() / (sizeof(float) * 2); + int alphaSize = mResourceInt8->mOriginScale->size() / (sizeof(float) * 2); auto bufferAlloc = static_cast(backend())->getBufferAllocator(); auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT * mIm2ColCount, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums); mBlitInfoStride = blitInfoSize.second; mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); - mTempIm2ColBuffer.reset(Tensor::createDevice({mThreadNums, DST_XUNIT * mIm2ColCount * mResourceInt8->mWeightInt8->length(1) * SRC_UNIT})); - mTempSrcSum.resize(mThreadNums * mBlockNum * DST_XUNIT * mIm2ColCount * 4); // Use 4 bytes to save kernel sum. + auto icDiv4KernelCount = mIm2ColParamter.kernelCountUnit; + mTempIm2ColBuffer.reset(Tensor::createDevice({threads, DST_XUNIT * mIm2ColCount * icDiv4KernelCount * SRC_UNIT})); + mTempSrcSum.resize(threads * mBlockNum * DST_XUNIT * mIm2ColCount * 4); // Use 4 bytes to save kernel sum. success &= backend()->onAcquireBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC); if (!success || mBlitInfo.invalid()) { @@ -442,7 +544,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input return NO_ERROR; } - + int inC = inputs[0]->channel(); // set im2col tensor info mQuantInput.reset((Tensor::createDevice({batch, mIm2ColParamter.ih, mIm2ColParamter.iw, ROUND_UP(inC, gcore->pack)}))); @@ -451,12 +553,12 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input // set compute buffer mDynamicBias.reset(Tensor::createDevice({ocUp4 * 4})); mScaleFuse.reset(Tensor::createDevice({alphaSize * 4})); - + success &= backend()->onAcquireBuffer(mQuantInput.get(), Backend::DYNAMIC); success &= backend()->onAcquireBuffer(mDynamicBias.get(), Backend::DYNAMIC); success &= backend()->onAcquireBuffer(mTempMaxMinValueBuffer.get(), Backend::DYNAMIC); success &= backend()->onAcquireBuffer(mScaleFuse.get(), Backend::DYNAMIC); - + if (mUseBatchQuan) { int infobytes = 4; // use float32 to save dequant scale and quant scale. int size = mThreadNums * batch * gcore->bytes + 2 * batch * infobytes; @@ -502,7 +604,8 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu const auto col_buffer_unit_size = kernelCountUnitDouble * DST_XUNIT * SRC_UNIT * sizeof(int8_t); const auto col_buffer_size = col_buffer_unit_size * mIm2ColCount; const int dstBytes = static_cast(backend())->getBytes(backend(), output); - const int alphaSize = mResource->mDequantize.mScaleBias->size() / (4 * 2); + // const int alphaSize = mResource->mDequantize.mScaleBias->size() / (4 * 2); + const int alphaSize = mResourceInt8->mOriginScale->size() / (sizeof(float) * 2); const int blockL = kernelCountUnitDouble / mBlockNum; // source depthQuad for each block. float weightBytes = 1.f; int weight_step_Y = weightBytes * (UNIT__ * SRC_UNIT); @@ -512,15 +615,15 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu auto im2colPtr = mTempIm2ColBuffer->host(); const auto weightDataPtr = mResourceInt8->mWeightInt8->host(); auto srcKernelSumPtr = mTempSrcSum.data(); - auto weightDequantBias = mResource->mDequantize.mScaleBias->host() + alphaSize * 4; + auto weightDequantBias = mResourceInt8->mOriginScale->host() + alphaSize * 4; auto outputDataPtr = output->host(); - auto biasPtr = mMutableResource.mBiasFloat->host(); - auto scalePtr = mMutableResource.mScaleFloat->host(); + auto biasPtr = mMutableResource->mBiasFloat->host(); + auto scalePtr = mMutableResource->mScaleFloat->host(); - auto inputZeroPoint = mMutableResource.mInputZeroPoint; + auto inputZeroPoint = mMutableResource->mInputZeroPoint; auto inputScalePtr = mInputDeqScales->host(); - (reinterpret_cast(inputScalePtr))[0] = mMutableResource.mInputScale; + (reinterpret_cast(inputScalePtr))[0] = mMutableResource->mInputScale; auto SingleDynamicQuant = [&] () { const auto floatptr = input->host(); @@ -583,16 +686,16 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu } /* bias float */ - #ifdef MNN_USE_SSE + #ifdef MNN_USE_SSE int offset = 128; #else int offset = 0; #endif - auto biasfp32 = mMutableResource.mResource->mOriginBias->host(); - auto weightDequantScale = mResource->mDequantize.mScaleBias->host(); + auto biasfp32 = mMutableResource->mResource->mOriginBias->host(); + auto weightDequantScale = mResourceInt8->mOriginScale->host(); float zerofp32 = (zeropoint + offset) * dequantscale; - gcore->MNNDynamicUpdateConvBiasScale(mDynamicBias->host(), mScaleFuse->host(), biasfp32, weightDequantScale, &dequantscale, mResource->mWeightKernelSum->host(), &zerofp32, UP_DIV(output->channel(), 4), alphaSize); + gcore->MNNDynamicUpdateConvBiasScale(mDynamicBias->host(), mScaleFuse->host(), biasfp32, weightDequantScale, &dequantscale, mResourceInt8->mWeightKernelSum->host(), &zerofp32, UP_DIV(output->channel(), 4), alphaSize); // Move step for A and B for each block computing inputZeroPoint = zeropoint; @@ -643,12 +746,12 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu gcore->MNNDynamicQuant(inputData, int8ptr, scale_ptr, workCount, batch, PackUnit); } MNN_CONCURRENCY_END(); - + inputZeroPoint = 0; inputScalePtr = (uint8_t*)dequantPtr; inputDataPtr = mQuantInput->host(); - biasPtr = mMutableResource.mResource->mOriginBias->host(); - scalePtr = mResource->mDequantize.mScaleBias->host(); + biasPtr = mMutableResource->mResource->mOriginBias->host(); + scalePtr = mResourceInt8->mOriginScale->host(); }; ssize_t oneScale = 1; if (mUseBatchQuan) { @@ -659,12 +762,13 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu } else { // offline quant. } - - if (mResource->mDequantize.bits == 4) { + + + if (mResourceInt8->mActBits == 4) { weightBytes = 0.5; weight_step_Y *= 0.5; } - + SumByAxisParams sumParams; sumParams.oneScale = oneScale; sumParams.SRC_UNIT = SRC_UNIT; @@ -672,13 +776,13 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu sumParams.DST_XUNIT = DST_XUNIT; sumParams.col_buffer_unit_size = col_buffer_unit_size; sumParams.kernelCountUnitDouble = kernelCountUnitDouble; - + auto ThreadFunction = [&](int tId, int eStartIndex, int eEndIndex, int estep, int ocIndex) { auto ocDivThread = ocDiv4; if (mSplitByOc) { // Thread split by OC ocDivThread = ALIMIN(mDivides[tId + 1] - mDivides[tId], ocDiv4 - mDivides[tId]); } - float* reluPtr = mResource->mReluThreshold.data(); + float* reluPtr = mResourceInt8->mReluThreshold.data(); uint8_t* extraScale = nullptr; // input scale for batch dynamic quant. QuanPostTreatParameters quanParam; quanParam.blockNum = mBlockNum; @@ -686,17 +790,17 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu extraScale = inputScalePtr; } #ifdef MNN_USE_SSE - quanParam.extraBias = mResource->mWeightKernelSum->host() + ocIndex; + quanParam.extraBias = mResourceInt8->mWeightKernelSum->host() + ocIndex; #endif if (dstBytes != 1) { quanParam.useInt8 = 0; quanParam.fp32minmax = reluPtr; } else { - quanParam.maxValue = mMutableResource.mClampMax; + quanParam.maxValue = mMutableResource->mClampMax; if (mResourceInt8->mRelu) { - quanParam.minValue = mMutableResource.mOutputZeroPoint; + quanParam.minValue = mMutableResource->mOutputZeroPoint; } else { - quanParam.minValue = mMutableResource.mClampMin; + quanParam.minValue = mMutableResource->mClampMin; } } auto outputTid = outputDataPtr + ocIndex * plane * dstBytes; @@ -752,6 +856,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu int step = ALIMIN(DST_XUNIT, realDstCount); quanParam.srcKernelSum = ptrX; quanParam.extraScale = extraScale != nullptr ? (float*)ptrExtraScale : nullptr; + // printf("step=%d, ocDivThread=%d\n", step, ocDivThread); mGemmKernel(outputInTilePtr, colAddrTemp, weightPtrTid, kernelCountUnitDouble, dstZStep * dstBytes, ocDivThread, &quanParam, step); ptrX += step; realDstCount-=step; @@ -787,20 +892,21 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu } } }; - + const int threads = static_cast(backend())->threadNumber(); if (!mSplitByOc) { - MNN_CONCURRENCY_BEGIN(tId, mThreadNums) { + MNN_CONCURRENCY_BEGIN(tId, threads) { ThreadFunction((int)tId, mDivides[tId], mDivides[tId + 1], 1, 0); } MNN_CONCURRENCY_END(); } else { - MNN_CONCURRENCY_BEGIN(tId, mThreadNums) { + MNN_CONCURRENCY_BEGIN(tId, threads) { int ocIndex = PackUnit * mDivides[tId]; - ThreadFunction((int)tId, 0, mTileCount,1, ocIndex); + if (ocIndex < ocUp4) { + ThreadFunction((int)tId, 0, mTileCount,1, ocIndex); + } } MNN_CONCURRENCY_END(); } - return NO_ERROR; } diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp index d4524837c..c5fc5f4d3 100644 --- a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp +++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp @@ -18,7 +18,8 @@ namespace MNN { class ConvInt8TiledExecutor : public CPUConvolution { public: // given weight+bias+scale, do post process - ConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr res); + ConvInt8TiledExecutor(Backend* backend, const Op* op); + ConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr res); virtual ~ConvInt8TiledExecutor(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; @@ -31,8 +32,7 @@ class ConvInt8TiledExecutor : public CPUConvolution { int mThreadNums; std::shared_ptr mTempIm2ColBuffer; std::shared_ptr mResourceInt8; - // std::shared_ptr mResource; - CPUConvolution::MutableResourceInt8 mMutableResource; + std::shared_ptr mMutableResource; MemChunk mBlitInfo; std::pair mBlitInfoStride; int mIm2ColCount; @@ -50,14 +50,15 @@ class ConvInt8TiledExecutor : public CPUConvolution { class DenseConvInt8TiledExecutor : public ConvInt8TiledExecutor { public: // given weight+bias+scale, do post process - DenseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr res, bool dynamicQuantExe); + DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr res); // ptq + DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr quanCommon); // dynamic quant virtual ~DenseConvInt8TiledExecutor(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) override; private: - DenseConvInt8TiledExecutor(Backend* backend, const Convolution2D* common, bool dynamicQuantExe, const DenseConvInt8TiledExecutor& exe); + DenseConvInt8TiledExecutor(Backend* backend, const Op* op, const DenseConvInt8TiledExecutor& exe); decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel; std::function mQuantFunc; @@ -69,7 +70,6 @@ class DenseConvInt8TiledExecutor : public ConvInt8TiledExecutor { std::shared_ptr mBatchQuantInfo; std::shared_ptr mInputDeqScales; std::shared_ptr mTempMaxMinValueBuffer; - std::shared_ptr mResource; std::vector mTempSrcSum; std::vector mDivides; diff --git a/source/backend/cpu/compute/ConvInt8Winograd.cpp b/source/backend/cpu/compute/ConvInt8Winograd.cpp index 2d0a4b5f2..433b88812 100644 --- a/source/backend/cpu/compute/ConvInt8Winograd.cpp +++ b/source/backend/cpu/compute/ConvInt8Winograd.cpp @@ -23,17 +23,20 @@ namespace MNN { std::shared_ptr ConvInt8Winograd::makeWinoResource(const int8_t* originWeight, std::shared_ptr scaleFloat, const int32_t* attr, Backend* backend, int oc, int ic, int kernelY, int kernelX) { auto core = static_cast(backend)->int8Functions(); + auto gcore = static_cast(backend)->functions(); int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - int oc4 = UP_DIV(oc, UNIT), ic4 = UP_DIV(ic, SRC_UNIT); + int pack = gcore->pack; + int ocDivUnit = UP_DIV(oc, UNIT), ic4 = UP_DIV(ic, SRC_UNIT); + int oc4 = UP_DIV(oc, pack); int kySize = attr[2], kxSize = attr[3], unitY = attr[4], unitX = attr[5]; attr += 6; int alphaY = kySize + unitY - 1, alphaX = kxSize + unitX - 1, alpha2 = alphaY * alphaX; std::shared_ptr weight, offsets, scales, inputScales; - weight.reset(Tensor::createDevice({alpha2, oc4, ic4, UNIT, SRC_UNIT})); - offsets.reset(Tensor::createDevice({alpha2, oc4, UNIT})); - scales.reset(Tensor::createDevice({alpha2, oc4 * UNIT})); - inputScales.reset(Tensor::createDevice({alpha2, UNIT})); + weight.reset(Tensor::createDevice({alpha2, ocDivUnit, ic4, UNIT, SRC_UNIT})); + offsets.reset(Tensor::createDevice({alpha2, oc4, pack})); + scales.reset(Tensor::createDevice({alpha2, oc4 * pack})); + inputScales.reset(Tensor::createDevice({alpha2, pack})); auto allocTensors = [=](std::vector> tensors) -> bool { bool success = true; @@ -54,8 +57,8 @@ std::shared_ptr ConvInt8Winograd::makeWinoResour auto weightScaleData = (const float*)attr; attr += alpha2 * oc; for (int i = 0; i < alpha2; ++i) { auto scale = 1.0f / inputScaleData[i]; - for (int u = 0; u < UNIT; ++u) { - inputScales->host()[i * UNIT + u] = scale; + for (int u = 0; u < pack; ++u) { + inputScales->host()[i * pack + u] = scale; } } @@ -86,7 +89,7 @@ std::shared_ptr ConvInt8Winograd::makeWinoResour float scale = weightScaleData[a * oc + oz]; for (int sz = 0; sz < ic; ++sz) { int sz4 = sz / SRC_UNIT, szRemain = sz % SRC_UNIT; - int index = (((a * oc4 + oz4) * ic4 + sz4) * UNIT + ozRemain) * SRC_UNIT + szRemain; + int index = (((a * ocDivUnit + oz4) * ic4 + sz4) * UNIT + ozRemain) * SRC_UNIT + szRemain; float srcData = weightFloat->host()[(a * oc + oz) * ic + sz]; // -ffast-math may cause inexact input then wrong rounded result, add eps to avoid this float eps = ((srcData/scale) > 0 ? 1 : -1) * 1e-6; @@ -97,8 +100,9 @@ std::shared_ptr ConvInt8Winograd::makeWinoResour offset += quanData * (-128); #endif } - offsets->host()[a * oc4 * UNIT + oz] = offset * scale * inputScaleData[a]; - scales->host()[a * oc4 * UNIT + oz] = scale * inputScaleData[a]; + + offsets->host()[a * oc4 * pack + oz] = offset * scale * inputScaleData[a]; + scales->host()[a * oc4 * pack + oz] = scale * inputScaleData[a]; } } backend->onReleaseBuffer(originWeightFloat.get(), Backend::STATIC); @@ -184,6 +188,7 @@ ErrorCode ConvInt8Winograd::onResize(const std::vector &inputs, const int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); UNIT = gcore->pack; + int pack = gcore->pack; auto input = mInputFloat.get(), output = outputs[0]; int batch = input->batch(), ic = input->channel(), oc = output->channel(); @@ -197,7 +202,7 @@ ErrorCode ConvInt8Winograd::onResize(const std::vector &inputs, const } for (auto& unit : mUnits) { int sy = ALIMAX(unit.kyStart - mPadY, 0), sx = ALIMAX(unit.kxStart - mPadX, 0); - auto srcChunk = TensorUtils::getDescribeOrigin(input)->mem->chunk() + (sy * iw + sx) * UNIT; + auto srcChunk = TensorUtils::getDescribeOrigin(input)->mem->chunk() + (sy * iw + sx) * pack; unit.input.reset(Tensor::createDevice({batch, ic, ih - sy, iw - sx}, Tensor::CAFFE_C4)); TensorUtils::getDescribeOrigin(unit.input.get())->mem = (new CPUMemObj(nullptr, srcChunk, 0)); for (int i = 0; i < input->dimensions(); ++i) { @@ -223,14 +228,14 @@ static void mergeAddBiasScaleQuantize(const std::vector& inputs, Tensor auto coreInt8 = cpuBn->int8Functions(); int UNIT, SRC_UNIT, DST_XUNIT; coreInt8->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - UNIT = core->pack; + int pack = core->pack; - int countC4 = UP_DIV(output->channel(), UNIT), plane = output->height() * output->width() * output->batch(); + int countC4 = UP_DIV(output->channel(), pack), plane = output->height() * output->width() * output->batch(); auto mergeFloat = inputs[0]->host(); for (int i = 1; i < inputs.size(); ++i) { core->MNNMatrixAdd(mergeFloat, mergeFloat, inputs[i]->host(), plane * countC4, 0, 0, 0, 1); } - std::vector fakeScale(countC4 * UNIT, 1); + std::vector fakeScale(countC4 * pack, 1); core->MNNScaleAndAddBias(mergeFloat, mergeFloat, quanParam->biasFloat, fakeScale.data(), plane, countC4); coreInt8->MNNFloat2Int8(mergeFloat, output->host(), plane * countC4, quanParam->scale, quanParam->minValue, quanParam->maxValue, zeroPoint); } @@ -282,7 +287,8 @@ ErrorCode ConvInt8Winograd::onExecute(const std::vector &inputs, const auto gcore = bn->functions(); int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - UNIT = gcore->pack; + // UNIT = gcore->pack; + int pack = gcore->pack; // scale, zero, min, max auto inputQuant = TensorUtils::getQuantInfo(inputs[0]); auto outputQuant = TensorUtils::getQuantInfo(outputs[0]); @@ -299,9 +305,9 @@ ErrorCode ConvInt8Winograd::onExecute(const std::vector &inputs, const }; } - std::vector scale(UNIT, inputQuant[0]); + std::vector scale(pack, inputQuant[0]); int size = bn->getTensorSize(mInputFloat.get()); - core->MNNInt8ScaleToFloat(mInputFloat->host(), inputs[0]->host(), scale.data(), size / UNIT, inputQuant[1]); + core->MNNInt8ScaleToFloat(mInputFloat->host(), inputs[0]->host(), scale.data(), size / pack, inputQuant[1]); std::vector tmp_outputs; for (auto& unit : mUnits) { unit.input->buffer().host = TensorUtils::getDescribeOrigin(unit.input.get())->mem->chunk().ptr(); @@ -312,7 +318,7 @@ ErrorCode ConvInt8Winograd::onExecute(const std::vector &inputs, const tmp_outputs.push_back(unit.output.get()); } QuanPostTreatParameters quanParam; - scale.assign(UNIT, 1.0 / outputQuant[0]); + scale.assign(pack, 1.0 / outputQuant[0]); quanParam.scale = scale.data(); // For winograd Int8, will not treat origin bias to int32, use float directly quanParam.biasFloat = mResource->mOriginBias->host(); @@ -333,14 +339,14 @@ ConvInt8Winograd::WinoExecution::WinoExecution(std::shared_ptr res int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - UNIT = gcore->pack; + int pack = gcore->pack; int threadNumber = ((CPUBackend *)backend())->threadNumber(); int alphaY = mUnitY + mKernelY - 1, alphaX = mUnitX + mKernelX - 1, alpha2 = alphaY * alphaX; - int ic4 = UP_DIV(inputCount, SRC_UNIT), oc4 = UP_DIV(outputCount, UNIT); + int ic4 = UP_DIV(inputCount, SRC_UNIT), oc4 = UP_DIV(outputCount, pack); mTempInputBuffer.reset(Tensor::createDevice({threadNumber, alpha2, ic4, DST_XUNIT * SRC_UNIT})); - mTempOutputBuffer.reset(Tensor::createDevice({threadNumber, alpha2, oc4, DST_XUNIT * UNIT})); - int midSize = alpha2 * DST_XUNIT * ALIMAX(ROUND_UP(inputCount, UNIT), oc4 * UNIT); + mTempOutputBuffer.reset(Tensor::createDevice({threadNumber, alpha2, oc4, DST_XUNIT * pack})); + int midSize = alpha2 * DST_XUNIT * ALIMAX(ROUND_UP(inputCount, pack), oc4 * pack); mTransformMidBuffer.reset(Tensor::createDevice({threadNumber, 3, midSize})); } ConvInt8Winograd::WinoExecution::WinoExecution(Backend* bn, const WinoExecution& exe) @@ -374,6 +380,7 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector int UNIT, SRC_UNIT, DST_XUNIT; coreInt8->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); UNIT = core->pack; + int pack = core->pack; auto gemmFunc = coreInt8->Int8GemmKernel; CoreFunctions::WinoUnrollTransFunc srcTransXFunc = nullptr, srcTransYFunc = nullptr; @@ -395,8 +402,8 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector int ow = output->width(), oh = output->height(); int iw = input->width(), ih = input->height(); - int ic = input->channel(), ic_4 = UP_DIV(ic, UNIT); - int dc_4 = UP_DIV(output->channel(), UNIT); + int ic = input->channel(), ic_4 = UP_DIV(ic, pack); + int dc_4 = UP_DIV(output->channel(), pack); int padY = mPadY, padX = mPadX; auto wUnit = UP_DIV(ow, mUnitX), hUnit = UP_DIV(oh, mUnitY); @@ -418,9 +425,9 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector for (int hbIndex=oybBegin; hbIndex <= oybEnd; ++hbIndex) { auto hIndex = hbIndex % hUnit; auto bIndex = hbIndex / hUnit; - auto bOffset = iw * ih * UNIT * bIndex; + auto bOffset = iw * ih * pack * bIndex; auto srcBatch = srcOrigin + bOffset; - int dstZStep = DST_XUNIT * UNIT, unitStep = dstZStep * ic_4; + int dstZStep = DST_XUNIT * pack, unitStep = dstZStep * ic_4; int step = std::min(wUnit - oxBegin, remain); int srcY = hIndex * mUnitY - padY; int ey = ALIMIN(srcY + alphaY, ih) - srcY; @@ -447,38 +454,38 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector int srcX = wIndex * mUnitX - padX; int sx = ALIMAX(0, srcX) - srcX; int ex = ALIMIN(srcX + alphaX, iw) - srcX; - auto dst_x = dstOrigin + si * UNIT; + auto dst_x = dstOrigin + si * pack; - int sourceZStep = iw * ih * UNIT * batch, sourceYStep = iw * UNIT; - auto srcStart = srcBatch + srcY * sourceYStep + srcX * UNIT; + int sourceZStep = iw * ih * pack * batch, sourceYStep = iw * pack; + auto srcStart = srcBatch + srcY * sourceYStep + srcX * pack; // when input window exceed limit (so need pad value), copy from src to midbuffer0 if (ex - sx != alphaX || ey - sy != alphaY) { - ::memset(midBuffer0, 0, alpha2 * ic_4 * UNIT * sizeof(float)); - int count = UNIT * (ex - sx); + ::memset(midBuffer0, 0, alpha2 * ic_4 * pack * sizeof(float)); + int count = pack * (ex - sx); for (int z = 0; count > 0 && z < ic_4; ++z) { for (int yy = sy; yy < ey; ++yy) { - auto dst_yy = midBuffer0 + ((z * alphaY + yy) * alphaX + sx) * UNIT; - auto src_yy = srcStart + z * sourceZStep + yy * sourceYStep + sx * UNIT; + auto dst_yy = midBuffer0 + ((z * alphaY + yy) * alphaX + sx) * pack; + auto src_yy = srcStart + z * sourceZStep + yy * sourceYStep + sx * pack; ::memcpy(dst_yy, src_yy, count * sizeof(float)); } } srcStart = midBuffer0; - sourceZStep = alpha2 * UNIT; - sourceYStep = alphaX * UNIT; + sourceZStep = alpha2 * pack; + sourceYStep = alphaX * pack; } for (int sz = 0; sz < ic_4; ++sz) { for (int s = 0; s < sStep; ++s) { - auto dst = dst_x + sz * dstZStep + s * UNIT; - auto src = srcStart + sz * sourceZStep + s * mUnitX * UNIT; - srcTransXFunc(src, midBuffer1, sourceYStep, alphaX * UNIT, UNIT, UNIT); - srcTransYFunc(midBuffer1, dst, UNIT, unitStep, alphaX * UNIT, alphaX * unitStep); + auto dst = dst_x + sz * dstZStep + s * pack; + auto src = srcStart + sz * sourceZStep + s * mUnitX * pack; + srcTransXFunc(src, midBuffer1, sourceYStep, alphaX * pack, pack, pack); + srcTransYFunc(midBuffer1, dst, pack, unitStep, alphaX * pack, alphaX * unitStep); } } si += sStep; } oxBegin = 0; remain -= step; - dstOrigin += UNIT * step; + dstOrigin += pack * step; } }; @@ -488,7 +495,7 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector auto weight = mWinoResource->weight->host(); std::vector xkernelSum(DST_XUNIT, 0); - std::vector wKernelSum(dc_4 * UNIT, 0); + std::vector wKernelSum(dc_4 * pack, 0); std::vector reluThred = {-std::numeric_limits().max(), std::numeric_limits().max()}; auto tFunction = [&](int tId) { @@ -505,20 +512,20 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector #ifndef MNN_WINO_TRANFORM_TEST_CLOSE src_trans_func(buffer2, srcOrigin, buffer0, xIndex, xC); #endif - ::memset(buffer1, 0, dc_4 * UNIT * sizeof(float)); + ::memset(buffer1, 0, dc_4 * pack * sizeof(float)); // Multi for (int i = 0; i < alpha2; ++i) { auto _srcInt8Ptr = _srcOrigin + i * mTempInputBuffer->stride(1); - auto scaleVec = mWinoResource->transInputScales->host() + i * UNIT; + auto scaleVec = mWinoResource->transInputScales->host() + i * pack; int zeroPoint = mWinoResource->transInputZeroPoints[i]; - coreInt8->MNNFloat2Int8(buffer2 + i * DST_XUNIT * ic_4 * UNIT, (UNIT == SRC_UNIT ? _srcInt8Ptr: (int8_t*)buffer0), ic_4 * DST_XUNIT, scaleVec, -127, 127, zeroPoint); - if (UNIT != SRC_UNIT) { + coreInt8->MNNFloat2Int8(buffer2 + i * DST_XUNIT * ic_4 * pack, (pack == SRC_UNIT ? _srcInt8Ptr: (int8_t*)buffer0), ic_4 * DST_XUNIT, scaleVec, -127, 127, zeroPoint); + if (pack != SRC_UNIT) { int areaOffset[] = {DST_XUNIT, DST_XUNIT}, byte = sizeof(float); - _reorderCommon((float*)_srcInt8Ptr, buffer0, DST_XUNIT, UP_DIV(ic, byte), areaOffset, UNIT / byte, SRC_UNIT / byte); + _reorderCommon((float*)_srcInt8Ptr, buffer0, DST_XUNIT, UP_DIV(ic, byte), areaOffset, pack / byte, SRC_UNIT / byte); } - auto _dstFloatPtr = _dstOrigin + i * dc_4 * xC * UNIT; + auto _dstFloatPtr = _dstOrigin + i * dc_4 * xC * pack; auto _weightInt8Ptr = weight + i * mWinoResource->weight->stride(0); QuanPostTreatParameters quanParam; quanParam.biasFloat = (mWinoResource->offsets->host() + i * mWinoResource->offsets->stride(0)); @@ -526,16 +533,16 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector quanParam.srcKernelSum = xkernelSum.data(); quanParam.weightQuanBias = wKernelSum.data(); quanParam.fp32minmax = reluThred.data(); - quanParam.scale = mWinoResource->scales->host() + i * dc_4 * UNIT; + quanParam.scale = mWinoResource->scales->host() + i * dc_4 * pack; quanParam.extraScale = nullptr; - gemmFunc((int8_t*)_dstFloatPtr, _srcInt8Ptr, _weightInt8Ptr, mTempInputBuffer->length(2), xC * UNIT * sizeof(float), dc_4, &quanParam, xC); + gemmFunc((int8_t*)_dstFloatPtr, _srcInt8Ptr, _weightInt8Ptr, mTempInputBuffer->length(2), xC * pack * sizeof(float), dc_4, &quanParam, xC); } #ifndef MNN_WINO_TRANFORM_TEST_CLOSE { auto midBuffer0 = buffer0; auto midBuffer1 = (float*)((int8_t*)midBuffer0 + mTransformMidBuffer->stride(1)); - int srcZStep = xC * UNIT; - int unitStep = dc_4 * xC * UNIT; + int srcZStep = xC * pack; + int unitStep = dc_4 * xC * pack; int oybBegin = xIndex / wUnit; int oxBegin = xIndex % wUnit; int oybEnd = (xIndex + xC-1) / wUnit; @@ -565,32 +572,32 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector for (int si=0; si } oxBegin = 0; remain -= step; - dstS += UNIT * step; + dstS += pack * step; } } #endif diff --git a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp index 40a444696..738d85826 100644 --- a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp +++ b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp @@ -26,8 +26,9 @@ namespace MNN { static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend* backend, - const Convolution2D* conv2d, const float* originWeight, size_t originWeightSize, const float* bias, size_t biasSize, std::shared_ptr weightQuantInfo, bool supportSparse, bool lowMemory) { + const Op* op, const float* originWeight, size_t originWeightSize, const float* bias, size_t biasSize, std::shared_ptr weightQuantInfo, bool supportSparse, bool lowMemory) { auto cpuBackend = (CPUBackend*)backend; + auto conv2d = op->main_as_Convolution2D(); auto common = conv2d->common(); #ifdef MNN_USE_ONEDNN return OneDNN::createConvolution(common, backend, originWeight, originWeightSize, bias, biasSize); @@ -47,9 +48,10 @@ static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend if (lowMemory && nullptr != weightQuantInfo.get() && originWeightSize == 0) { if (cpuBackend->memoryMode() == BackendConfig::Memory_Low) { - auto core = static_cast(backend)->functions(); - auto resourceInt8 = CPUConvolution::makeResourceInt8(backend, conv2d, core->pack); - return new DenseConvInt8TiledExecutor(backend, conv2d, resourceInt8, true); + // auto core = static_cast(backend)->functions(); + // auto resourceInt8 = CPUConvolution::makeResourceInt8(backend, op, core->pack); + // return new DenseConvInt8TiledExecutor(backend, op, resourceInt8, true); + return new DenseConvInt8TiledExecutor(backend, op, weightQuantInfo); } else { return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo); } @@ -107,7 +109,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector& inputs, c // The weight is storage as float sparse, but the backend don't support sparse compute, expand it forceFloat = true; } - quanCommon = ConvolutionCommon::load(conv2d, backend, forceFloat, lowMemory); + quanCommon = ConvolutionCommon::load(op, backend, forceFloat, lowMemory); if (nullptr == quanCommon) { MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", op->name()->c_str()); return nullptr; @@ -143,7 +145,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector& inputs, c } MNN_ASSERT(group > 0); if (1 == group) { - return _createUnit(inputs[0], outputs[0], backend, conv2d, originWeight, originWeightSize, + return _createUnit(inputs[0], outputs[0], backend, op, originWeight, originWeightSize, originBias, originBiasSize, quanCommon, supportSparse, lowMemory); } // TODO: Use Geometry to split @@ -157,7 +159,7 @@ Execution* ConvolutionFloatFactory::create(const std::vector& inputs, c emptyOutput->setLength(1, outputs[0]->channel() / group); for (int i = 0; i < group; ++i) { auto newConvolution = - _createUnit(emptyInput.get(), emptyOutput.get(), backend, conv2d, originWeight + groupWeightSize * i, + _createUnit(emptyInput.get(), emptyOutput.get(), backend, op, originWeight + groupWeightSize * i, groupWeightSize, conv2d->bias()->data() + groupOutputCount * i, groupOutputCount, quanCommon, supportSparse, lowMemory); subConvolution.push_back(std::shared_ptr(newConvolution)); } diff --git a/source/backend/cpu/compute/DeconvolutionWithStride.cpp b/source/backend/cpu/compute/DeconvolutionWithStride.cpp index 74b78d28d..732b6540d 100644 --- a/source/backend/cpu/compute/DeconvolutionWithStride.cpp +++ b/source/backend/cpu/compute/DeconvolutionWithStride.cpp @@ -177,7 +177,7 @@ DeconvolutionWithStride::DeconvolutionWithStride(const Tensor* input, const Op* int tempWeightSize = 0; int srcCount = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, b, conv2D, &tempWeight, &tempWeightSize); + ConvolutionCommon::getConvParameters(&quanCommon, b, convOp, &tempWeight, &tempWeightSize); srcCount = tempWeightSize / kx / ky / outputCount; int sy = common->strideY(); @@ -270,7 +270,7 @@ void DeconvolutionWithStride::_extract(const Op* convOp) { int tempWeightSize = 0; int srcCount = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend(), conv2D, &tempWeight, &tempWeightSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend(), convOp, &tempWeight, &tempWeightSize); srcCount = tempWeightSize / kx / ky / outputCount; std::shared_ptr weightWrap( diff --git a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp index 61dfb445a..918f47fa1 100644 --- a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp @@ -29,36 +29,12 @@ void DenseConvolutionTiledExecutor::initWeight(float *dest, const float *source, } bool DenseConvolutionTiledExecutor::initQuantizeResource(std::shared_ptr int8Info, std::shared_ptr resource, int hU, int hP, int lU, int lP, int outputCount, int srcChannel, int kernelSize, int bytes) { int weightLength = hU * lU * hP * lP; - resource->mWeight.reset(Tensor::createDevice(std::vector{hU, lU * lP, hP})); -// resource->mWeight.reset(Tensor::createDevice({weightLength})); - auto res = resource->backend->onAcquireBuffer(resource->mWeight.get(), Backend::STATIC); - if (!res) { - return false; - } resource->mDequantize.bits = 8; resource->lU = lU; resource->hU = hU; resource->lP = lP; resource->hP = hP; MNN_ASSERT(lP == 1); - // Reorder weight - - auto dstWInt8 = resource->mWeight->host(); - auto srcWInt8 = int8Info->weight.get(); - ::memset(dstWInt8, 0, resource->mWeight->usize()); - for (int y=0; yalpha.size(); int scaleSize = dequantCnt; // real size @@ -69,7 +45,7 @@ bool DenseConvolutionTiledExecutor::initQuantizeResource(std::shared_ptrmDequantize.mScaleBias.reset(MNN::Tensor::createDevice({scaleSize * 2 * bytes})); - res = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC); + auto res = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC); if (!res) { return false; } @@ -78,22 +54,68 @@ bool DenseConvolutionTiledExecutor::initQuantizeResource(std::shared_ptrmDequantize.bits = 4; - std::shared_ptr weightLow(Tensor::createDevice( - {weightLength})); - auto res = resource->backend->onAcquireBuffer(weightLow.get(), Backend::STATIC); + resource->mWeight.reset(Tensor::createDevice(std::vector{weightLength})); + auto res = resource->backend->onAcquireBuffer(resource->mWeight.get(), Backend::STATIC); if (!res) { return false; } - auto srcPtr = resource->mWeight->host(); - auto dstPtr = weightLow->host(); - for (int i=0; imWeight->host(); + auto srcWInt4 = int8Info->weight.get(); + if (kernelSize == 1 && srcChannel % 2 == 0 && hU * hP == outputCount) { + for (int i = 0; i < hU; i++) { + for (int j = 0; j < srcChannel/2; j++) { + for (int k = 0; k < hP/2; k++) { + uint8_t s0 = srcWInt4[((i * hP + (k * 2 + 0)) * srcChannel) / 2 + j]; + uint8_t s1 = srcWInt4[((i * hP + (k * 2 + 1)) * srcChannel) / 2 + j]; + uint8_t d0 = (s0 & 0xf0) | (s1 >> 4); + uint8_t d1 = (s0 << 4) | (s1 & 0x0f); + dstWInt4[(i * srcChannel + (j * 2 + 0)) * hP / 2 + k] = d0; + dstWInt4[(i * srcChannel + (j * 2 + 1)) * hP / 2 + k] = d1; + } + } + } + } else { + // [oc, ic, ks] -> [oc/hP, ks, ic, hP] + ::memset(dstWInt4, 0, resource->mWeight->usize()); + for (int y = 0; y < outputCount; ++y) { + int yo = y / hP; + int yi = y % hP; + for (int iz = 0; iz < srcChannel; ++iz) { + for (int k=0; k < kernelSize; ++k) { + int sx = y * srcChannel * kernelSize + iz * kernelSize + k; + int dx = yo * lP * hP * lU + (iz + k * srcChannel) * hP + yi; + uint8_t s = srcWInt4[sx/2]; + s = (sx % 2) ? (s & 0xf) : (s >> 4); + s = (dx % 2) ? s : (s << 4); + dstWInt4[dx/2] |= s; + } + } + } } originOffset = -8; - resource->mWeight = weightLow; + } else { + resource->mWeight.reset(Tensor::createDevice(std::vector{hU, lU * lP, hP})); + auto res = resource->backend->onAcquireBuffer(resource->mWeight.get(), Backend::STATIC); + if (!res) { + return false; + } + // Reorder weight for int8 + auto dstWInt8 = resource->mWeight->host(); + auto srcWInt8 = int8Info->weight.get(); + ::memset(dstWInt8, 0, resource->mWeight->usize()); + for (int y=0; ymDequantize.mScaleBias->host(); auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + scaleSize * bytes); @@ -180,6 +202,9 @@ DenseConvolutionTiledExecutor::DenseConvolutionTiledExecutor(const Convolution2D MNN_ASSERT(nullptr != int8Info.get()); originWeightSize = int8Info->weight.size(); } + if (int8Info && int8Info->canUseInt4) { + originWeightSize *= 2; + } // Don't use common->inputCount for old model common->inputCount is zero auto srcCount = (int)originWeightSize / outputCount / common->kernelX() / common->kernelY(); auto lSize = srcCount * common->kernelX() * common->kernelY(); diff --git a/source/backend/cpu/compute/GemmInt8Executor.cpp b/source/backend/cpu/compute/GemmInt8Executor.cpp index bc5abc93b..a73afdba8 100644 --- a/source/backend/cpu/compute/GemmInt8Executor.cpp +++ b/source/backend/cpu/compute/GemmInt8Executor.cpp @@ -14,10 +14,10 @@ namespace MNN { -GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr resource, const Convolution2D *conv2D, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel, std::vector bias): - CPUConvolution(conv2D->common(), bn), mResourceInt8(resource), mMutableResource(resource, bn), mGemmKernel(gemmKernel), mQuantBias(bias){ +GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr resource, const Op *op, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel, std::vector bias) : + CPUConvolution(op->main_as_Convolution2D()->common(), bn), mResourceInt8(resource), mMutableResource(resource, bn), mGemmKernel(gemmKernel), mQuantBias(bias){ mResource.reset(new Resource); - CPUConvolution::makeResource(bn, mResource, conv2D, mResourceInt8); + CPUConvolution::makeResource(bn, mResource, op, mResourceInt8); } GemmInt8Executor::~GemmInt8Executor() { @@ -39,8 +39,8 @@ ErrorCode GemmInt8Executor::onResize(const std::vector &inputs, const auto output = outputs[0]; auto core = static_cast(backend())->int8Functions(); - int UNIT___, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT___, &SRC_UNIT, &DST_XUNIT); + int UNIT__, SRC_UNIT, DST_XUNIT; + core->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT); auto gcore = static_cast(backend())->functions(); auto pack = gcore->pack; @@ -81,19 +81,20 @@ ErrorCode GemmInt8Executor::onResize(const std::vector &inputs, const mIm2ColParamter.kernelY = 1; mIm2ColParamter.padX = 0; mIm2ColParamter.padY = 0; - mIm2ColParamter.kernelCountUnit = UP_DIV(input->channel(), SRC_UNIT); - if (SRC_UNIT > UNIT___ && UNIT___ == pack) { + if (SRC_UNIT > pack) { const auto srcCountUnit = UP_DIV(input->channel(), pack); + mIm2ColParamter.kernelCountUnit = UP_DIV(srcCountUnit, SRC_UNIT / pack); mIm2ColParamter.ic = mIm2ColParamter.icDiv4 * pack; } else { const auto srcCountUnit = UP_DIV(input->channel(), SRC_UNIT); - mIm2ColParamter.ic = mIm2ColParamter.icDiv4 * pack; + mIm2ColParamter.kernelCountUnit = srcCountUnit; + mIm2ColParamter.ic = srcCountUnit * SRC_UNIT; } mTileCnt = UP_DIV(input->height() * input->width() * input->batch(), DST_XUNIT); const int threads = std::max(static_cast(backend())->threadNumber(), 1); mThreadNums = std::min(threads, mTileCnt); - + mInputCol.reset(Tensor::createDevice({mThreadNums, DST_XUNIT, mIm2ColParamter.kernelCountUnit * SRC_UNIT})); bool success = backend()->onAcquireBuffer(mInputCol.get(), Backend::DYNAMIC); if (!success) { @@ -137,7 +138,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector &inputs, const auto im2colPtr = mInputCol->host(); auto outputDataPtr = output->host(); - + auto bias_elesize = ocDiv4 * PackUnit; QuanPostTreatParameters quanParam; quanParam.scale = mScaleData.data(); @@ -156,7 +157,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector &inputs, const quanParam.weightQuanBias = mKernelSum.data(); quanParam.extraScale = nullptr; float dequantScale = mMutableResource.mResource->mInputScale; - + SumByAxisParams sumParams; sumParams.DST_XUNIT = DST_XUNIT; sumParams.SRC_UNIT = SRC_UNIT; @@ -210,7 +211,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector &inputs, const threadFunction((int)tId); } MNN_CONCURRENCY_END(); - + // MNN_PRINT("deconv int8 execute: cost time: %llu us\n", kernelTimer.durationInUs()); return NO_ERROR; } diff --git a/source/backend/cpu/compute/GemmInt8Executor.hpp b/source/backend/cpu/compute/GemmInt8Executor.hpp index 0c1345f03..a93f90435 100644 --- a/source/backend/cpu/compute/GemmInt8Executor.hpp +++ b/source/backend/cpu/compute/GemmInt8Executor.hpp @@ -14,7 +14,7 @@ namespace MNN { class GemmInt8Executor : public CPUConvolution { public: - GemmInt8Executor(Backend* bn, std::shared_ptr resource, const Convolution2D *conv2D, decltype(CoreInt8Functions::Int8GemmKernel), + GemmInt8Executor(Backend* bn, std::shared_ptr resource, const Op *op, decltype(CoreInt8Functions::Int8GemmKernel), std::vector bias); virtual ~GemmInt8Executor(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; diff --git a/source/backend/cpu/compute/IdstConvolutionInt8.cpp b/source/backend/cpu/compute/IdstConvolutionInt8.cpp index 025ed8763..05a9df338 100644 --- a/source/backend/cpu/compute/IdstConvolutionInt8.cpp +++ b/source/backend/cpu/compute/IdstConvolutionInt8.cpp @@ -65,9 +65,9 @@ IdstConvolutionInt8::IdstConvolutionInt8(const Convolution2DCommon* convOp, Back auto kernelCount = kx * ky; auto srcCount = mSrcCount; std::vector shape; - if (SRC_UNIT > UNIT && UNIT == PackUnit) { + if (SRC_UNIT > PackUnit) { MNN_ASSERT(SRC_UNIT % UNIT == 0); - shape = {UP_DIV(outputCount, UNIT), UP_DIV(UP_DIV(srcCount, UNIT) * kernelCount, SRC_UNIT / UNIT), UNIT, SRC_UNIT}; + shape = {UP_DIV(outputCount, UNIT), UP_DIV(UP_DIV(srcCount, PackUnit) * kernelCount, SRC_UNIT / PackUnit), UNIT, SRC_UNIT}; } else { shape = {UP_DIV(outputCount, UNIT), UP_DIV(srcCount, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}; } diff --git a/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp b/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp index 5c8fc0dca..ed5364c10 100644 --- a/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp +++ b/source/backend/cpu/compute/SparseConvInt8TiledExecutor.cpp @@ -64,8 +64,9 @@ bool SparseConvInt8TiledExecutor::reorderWeight(Backend* b, const Convolution2DC return true; } -SparseConvInt8TiledExecutor::SparseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr res) : ConvInt8TiledExecutor(backend, convOp, res) { +SparseConvInt8TiledExecutor::SparseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr res) : ConvInt8TiledExecutor(backend, op, res) { + auto convOp = op->main_as_Convolution2D(); std::shared_ptr weightOrigin; weightOrigin.swap(mResourceInt8->mWeightInt8); const SparseCommon* sparseCommon = convOp->sparseParameter(); @@ -81,9 +82,9 @@ SparseConvInt8TiledExecutor::SparseConvInt8TiledExecutor(Backend* backend, const } -SparseConvInt8TiledExecutor::SparseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, +SparseConvInt8TiledExecutor::SparseConvInt8TiledExecutor(Backend* backend, const Op* op, const SparseConvInt8TiledExecutor& exe) - : ConvInt8TiledExecutor(backend, convOp, exe.mResourceInt8), + : ConvInt8TiledExecutor(backend, op, exe.mResourceInt8), mNNZMap(exe.mNNZMap), mDataOffsetMap(exe.mDataOffsetMap), mSparseBlockOC(exe.mSparseBlockOC), @@ -98,7 +99,7 @@ bool SparseConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution** if (nullptr == dst) { return true; } - auto exe = new SparseConvInt8TiledExecutor(bn, op->main_as_Convolution2D(), *this); + auto exe = new SparseConvInt8TiledExecutor(bn, op, *this); if (!exe->valid()) { return false; } @@ -176,13 +177,13 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector& inp auto im2colPtr = mTempIm2ColBuffer->host(); auto outputDataPtr = output->host(); QuanPostTreatParameters quanParam; - quanParam.bias = mMutableResource.mBiasInt32->host(); - quanParam.scale = mMutableResource.mScaleFloat->host(); - quanParam.maxValue = mMutableResource.mClampMax; + quanParam.bias = mMutableResource->mBiasInt32->host(); + quanParam.scale = mMutableResource->mScaleFloat->host(); + quanParam.maxValue = mMutableResource->mClampMax; if (mResourceInt8->mRelu) { - quanParam.minValue = mMutableResource.mOutputZeroPoint; + quanParam.minValue = mMutableResource->mOutputZeroPoint; } else { - quanParam.minValue = mMutableResource.mClampMin; + quanParam.minValue = mMutableResource->mClampMin; } // MNN_PRINT("outputPlaneLen: %d, reduce l:%zu, minValue:%d, maxValue:%d, mTileCount:%d\n", outputPlaneLen, mSparseQuantParam.l, quanParam.minValue, quanParam.maxValue, mTileCount); const int col_buffer_size = mTempIm2ColBuffer->stride(0); @@ -207,9 +208,9 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector& inp bool needZero = res.second; if (needZero) { #ifdef MNN_USE_SSE - ::memset(colAddr, mMutableResource.mInputZeroPoint + 128, col_buffer_size); + ::memset(colAddr, mMutableResource->mInputZeroPoint + 128, col_buffer_size); #else - ::memset(colAddr, mMutableResource.mInputZeroPoint, col_buffer_size); + ::memset(colAddr, mMutableResource->mInputZeroPoint, col_buffer_size); #endif } info[0] = number; diff --git a/source/backend/cpu/compute/SparseConvInt8TiledExecutor.hpp b/source/backend/cpu/compute/SparseConvInt8TiledExecutor.hpp index 9bcb7ee61..3f57e4454 100644 --- a/source/backend/cpu/compute/SparseConvInt8TiledExecutor.hpp +++ b/source/backend/cpu/compute/SparseConvInt8TiledExecutor.hpp @@ -31,7 +31,7 @@ struct SparseQuantMatMulParam { class SparseConvInt8TiledExecutor : public ConvInt8TiledExecutor { public: // given weight+bias+scale, do post process - SparseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, std::shared_ptr res); + SparseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr res); virtual ~SparseConvInt8TiledExecutor(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; @@ -50,7 +50,7 @@ class SparseConvInt8TiledExecutor : public ConvInt8TiledExecutor { } private: - SparseConvInt8TiledExecutor(Backend* backend, const Convolution2D* convOp, const SparseConvInt8TiledExecutor& exe); + SparseConvInt8TiledExecutor(Backend* backend, const Op* op, const SparseConvInt8TiledExecutor& exe); SparseQuantMatMulParam mSparseQuantParam; decltype(CoreInt8Functions::MNNPackedSparseQuantMatMulEpx1) mSparseQuantMatMulKernel; diff --git a/source/backend/cuda/execution/ConvCutlassExecution.cu b/source/backend/cuda/execution/ConvCutlassExecution.cu index 28f455a55..b8a960314 100644 --- a/source/backend/cuda/execution/ConvCutlassExecution.cu +++ b/source/backend/cuda/execution/ConvCutlassExecution.cu @@ -26,7 +26,7 @@ ConvCutlassExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { const float* filterDataPtr = nullptr; int weightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &filterDataPtr, &weightSize); auto oc = common->outputCount(); int l = weightSize / oc; @@ -195,7 +195,7 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector &inputs, con // Call from different function if(mFp32Infer){ return callCutlassGemmCudaCoreFloat32(inputs, outputs); - } + } mGpuComputeCap = runtime->compute_capability(); //MNN_PRINT("Gpu smArch is sm_%d\n", mGpuComputeCap); @@ -211,10 +211,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector &inputs, con // 0 -> Gemm, 1~N -> BatchGemm int32_t batchSize = 0; // [0]->A, [1]->B, [2]->bias, [3]->output - std::pair ptrOffset[4]; + std::pair ptrOffset[4]; int32_t batchOffset[4]; // [0]->alpha, [1]->beta, [2]->splitK - int32_t coefs[3]; + int32_t coefs[3]; // 0 -> RowColumn, 1 -> RowRow int32_t layout; bool epilogueVectorize @@ -246,7 +246,7 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector &inputs, con return NO_ERROR; } #endif - + return callCutlassGemmTensorCore(inputs, outputs); } diff --git a/source/backend/cuda/execution/ConvDepthWiseExecution.cu b/source/backend/cuda/execution/ConvDepthWiseExecution.cu index f4baac9d5..d53f67972 100755 --- a/source/backend/cuda/execution/ConvDepthWiseExecution.cu +++ b/source/backend/cuda/execution/ConvDepthWiseExecution.cu @@ -40,7 +40,7 @@ __global__ void CONV_DW(const T* input, d_oc.divmod(index, tmp1, oz_2); d_ow.divmod(tmp1, tmp2, ox); d_oh.divmod(tmp2, ob, oy); - + int oz = oz_2 << 1; int ix = ox * sw - pw; int iy = oy * sh - ph; @@ -80,10 +80,10 @@ __global__ void CONV_DW(const T* input, } } -__global__ void CONV_DW_HALF2_OPT(const half2* input, - const half2* kernel, - const half2* bias, - half2 *output, +__global__ void CONV_DW_HALF2_OPT(const half2* input, + const half2* kernel, + const half2* bias, + half2 *output, const float maxV, const float minV, const int iw, @@ -111,7 +111,7 @@ __global__ void CONV_DW_HALF2_OPT(const half2* input, d_oc.divmod(index, tmp1, oz_2); d_ow.divmod(tmp1, tmp2, ox); d_oh.divmod(tmp2, ob, oy); - + int oz = oz_2; int ix = ox * sw - pw; int iy = oy * sh - ph; @@ -144,10 +144,10 @@ __global__ void CONV_DW_HALF2_OPT(const half2* input, } } -__global__ void CONV_DW3x3_HALF2_OPT(const half2* input, - const half2* kernel, - const half2* bias, - half2 *output, +__global__ void CONV_DW3x3_HALF2_OPT(const half2* input, + const half2* kernel, + const half2* bias, + half2 *output, const float maxV, const float minV, const int iw, @@ -175,7 +175,7 @@ __global__ void CONV_DW3x3_HALF2_OPT(const half2* input, d_oc.divmod(index, tmp1, oz_2); d_ow.divmod(tmp1, tmp2, ox_2); d_oh.divmod(tmp2, ob, oy); - + int oz = oz_2; int ox = ox_2 << 1; int ix = ox - 1; @@ -348,7 +348,7 @@ __global__ void CONV_DW_MULTI_WIDTH4(const T* input, const half* kernel, const h float color3 = color0; // Parallel pipelining read and calculate - float src; + float src; float filter0, filter1, filter2, filter3; int src_offset = ((ob * ih + oy) * iw + (ox_4 << 2)) * c_p + oz; int filter_offset = 0 * c_p + oz; @@ -450,7 +450,7 @@ __global__ void CONV_DW_MULTI_WIDTH_CHANNEL(const float* input, const half* kern float2 src = ((float2 *)(input + src_offset + 0 * c_p))[0]; float2 filter = __half22float2(((half2 *)(kernel + filter_offset + 0 * c_p))[0]); - + color0.x += (src.x * filter.x); color0.y += (src.y * filter.y); @@ -589,9 +589,9 @@ ErrorCode ConvDepthWiseCompute(Backend* bn, return NO_ERROR; } - if(dw == 1 && dh == 1) { + if(dw == 1 && dh == 1) { if(sw == 1 && sh == 1 && pw == 0 && ph == 0 && kw > 3 && kw < 12 && kh == 1 && pw == 0 && ph == 0) { - + if(ow % 4 == 0) { DivModFast d_oc(c * PACK_NUMBER); DivModFast d_ow(ow/4); @@ -655,7 +655,7 @@ static std::shared_ptr _makeResource(const Op* const float* filterDataPtr = nullptr; int weightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &filterDataPtr, &weightSize); auto tempWeightStorage = pool->alloc(depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float)); auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second; cuda_check(cudaMemset(tempWeight, 0, depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float))); @@ -666,7 +666,7 @@ static std::shared_ptr _makeResource(const Op* auto regionStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion)); auto offsetGpuStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(offset)); auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second; - + #ifdef ENABLE_CUDA_BF16 if(static_cast(bn)->getPrecision() == 3) { // [Oc, Kh*Kw] -> [Kh*Kw, Oc(p)] @@ -677,7 +677,7 @@ static std::shared_ptr _makeResource(const Op* WeightTransToBf16<<>>((const float*)tempWeight, (__nv_bfloat16*)res->mFilter, count,\ kernelY * kernelX, depth, d_ocp); checkKernelErrors; - } + } else #endif { @@ -717,15 +717,15 @@ static std::shared_ptr _makeResource(const Op* cuda_check(cudaMemcpy(tempBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); #ifdef ENABLE_CUDA_BF16 - if(static_cast(bn)->getPrecision() == 3) + if(static_cast(bn)->getPrecision() == 3) { auto countBias = depthC * PACK_NUMBER; int block_num = runtime->blocks_num(countBias); int threads_num = runtime->threads_num(); BiasTransToBf16<<>>((const float*)tempBias, (__nv_bfloat16*)res->mBias, countBias, depth); checkKernelErrors; - } - else + } + else #endif { reg.size[0] = 1; diff --git a/source/backend/cuda/execution/ConvImplicitExecution.cu b/source/backend/cuda/execution/ConvImplicitExecution.cu index 58cda9dd6..d5353499e 100644 --- a/source/backend/cuda/execution/ConvImplicitExecution.cu +++ b/source/backend/cuda/execution/ConvImplicitExecution.cu @@ -82,7 +82,7 @@ ConvImplicitExecution::Resource::Resource(Backend* backend, const MNN::Op* op) { const float* filterDataPtr = nullptr; int weightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend, conv, &filterDataPtr, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &weightSize); mKernelInfo.kernelN = common->outputCount(); mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY; @@ -93,7 +93,7 @@ ConvImplicitExecution::Resource::Resource(Backend* backend, const MNN::Op* op) { int ci_pack = UP_DIV(mKernelInfo.kernelC, PACK_NUMBER) * PACK_NUMBER; int co_pack = UP_DIV(mKernelInfo.kernelN, PACK_NUMBER) * PACK_NUMBER; int khw = mKernelInfo.kernelX * mKernelInfo.kernelY; - + auto tempCacheBuffer = static_cast(backend)->getStaticBufferPool()->alloc(weightSize * sizeof(float)); float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second); runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice); @@ -108,16 +108,16 @@ ConvImplicitExecution::Resource::Resource(Backend* backend, const MNN::Op* op) { DivModFast cipD(ci_pack); DivModFast khwD(khw); - + int block_num = runtime->blocks_num(ci_pack * co_pack * khw); int block_size = runtime->threads_num(); - + if(static_cast(backend)->getPrecision() == 1) { WeightPackFill_Implicit<<>>((const float*)cacheWeight, (float*)mFilter, khw, ci_pack * co_pack * khw, mKernelInfo.kernelC, mKernelInfo.kernelN, cipD, khwD); checkKernelErrors; } else { WeightPackFill_Implicit<<>>((const float*)cacheWeight, (half*)mFilter, khw, ci_pack * co_pack * khw, mKernelInfo.kernelC, mKernelInfo.kernelN, cipD, khwD); - checkKernelErrors; + checkKernelErrors; } static_cast(backend)->getStaticBufferPool()->free(tempCacheBuffer); } @@ -142,7 +142,7 @@ ConvImplicitExecution::Resource::Resource(Backend* backend, const MNN::Op* op) { int alignSize = UP_DIV(biasSize, PACK_NUMBER) * PACK_NUMBER; biasTensor.reset(Tensor::createDevice({alignSize})); backend->onAcquireBuffer(biasTensor.get(), Backend::STATIC); - + mBias = (void *)biasTensor.get()->buffer().device; cuda_check(cudaMemset(mBias, 0, alignSize*sizeof(float))); cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); @@ -159,7 +159,7 @@ ConvImplicitExecution::ConvImplicitExecution(Backend* backend, const MNN::Op* op #else Execution(backend), #endif - mOp(op) + mOp(op) { mResource = res; int precisonLevel = static_cast(backend)->getPrecision(); @@ -206,7 +206,7 @@ ErrorCode ConvImplicitExecution::onResize(const std::vector &inputs, c // Split K dimension into 1 partitions int split_k_slices = 1; int ci_pack = UP_DIV(input->channel(), PACK_NUMBER) * PACK_NUMBER; - int co_pack = UP_DIV(output->channel(), PACK_NUMBER) * PACK_NUMBER; + int co_pack = UP_DIV(output->channel(), PACK_NUMBER) * PACK_NUMBER; // Construct Conv2dProblemSize with user defined output size cutlass::conv::Conv2dProblemSize problem_size( input->batch(),//int N, @@ -253,7 +253,7 @@ ErrorCode ConvImplicitExecution::onResize(const std::vector &inputs, c mWorkspace = (void *)workspaceTensor.get()->buffer().device; } - // Check the problem size is supported or not + // Check the problem size is supported or not cutlass::Status status = mImplicitConvOp.can_implement(arguments); cutlass_check(status); diff --git a/source/backend/cuda/execution/ConvWinogradExecution.cu b/source/backend/cuda/execution/ConvWinogradExecution.cu index 249720d10..fab1670db 100644 --- a/source/backend/cuda/execution/ConvWinogradExecution.cu +++ b/source/backend/cuda/execution/ConvWinogradExecution.cu @@ -14,7 +14,7 @@ namespace CUDA { #define UNIT 2 template -__global__ void WinoWeightReorder(const float* GgGt, +__global__ void WinoWeightReorder(const float* GgGt, T* GgGt_trans, const int block, const int co_pack, @@ -67,7 +67,7 @@ ConvWinogradExecution::Resource::Resource(Backend* backend, const MNN::Op* op) { const float* filterDataPtr = nullptr; int weightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend, conv, &filterDataPtr, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &weightSize); mKernelInfo.kernelN = common->outputCount(); mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY; @@ -110,7 +110,7 @@ ConvWinogradExecution::Resource::Resource(Backend* backend, const MNN::Op* op) { } static_cast(backend)->getStaticBufferPool()->free(tempCacheBuffer); } - + // Copy Bias int biasSize = conv->bias()->size(); int alignSize = UP_DIV(biasSize, PACK_NUMBER) * PACK_NUMBER; @@ -133,7 +133,7 @@ ConvWinogradExecution::ConvWinogradExecution(Backend* backend, const MNN::Op* op #else Execution(backend), #endif - mOp(op) + mOp(op) { mResource = res; int precisonLevel = static_cast(backend)->getPrecision(); @@ -197,10 +197,10 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector &inputs, c } auto bufferData = pool->alloc(BtdB_bytes * mBlock2 * mGemmInfo.elhPad[0] * mGemmInfo.elhPad[1]); mBtdB_Buffer = (void*)((uint8_t*)bufferData.first + bufferData.second); - + auto bufferMatmul = pool->alloc(bytes * mBlock2 * mGemmInfo.elh[0] * mGemmInfo.elhPad[2]); mMatmul_Buffer = (void*)((uint8_t*)bufferMatmul.first + bufferMatmul.second); - + pool->free(bufferData); pool->free(bufferMatmul); @@ -231,7 +231,7 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector &inputs, c mWorkspace = (void *)workspaceTensor.get()->buffer().device; } - // Check the problem size is supported or not + // Check the problem size is supported or not cutlass::Status status = mGemmBatchedCudaF32F32Ln.can_implement(arguments); cutlass_check(status); @@ -258,24 +258,24 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector &inputs, c (int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[2]), // batch_stride_C {alpha, beta}, // <- tuple of alpha and beta mBlock2}; // batch_count - + size_t workspace_size = GemmBatchedCuda_F16_F16_Linear_AlignCuda_Row_Column::get_workspace_size(arguments); - + if(workspace_size != 0) { workspaceTensor.reset(Tensor::createDevice({(int)workspace_size})); mResource->mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC); mWorkspace = (void *)workspaceTensor.get()->buffer().device; } - - // Check the problem size is supported or not + + // Check the problem size is supported or not cutlass::Status status = mGemmBatchedCudaF16F16Ln.can_implement(arguments); cutlass_check(status); - + // Initialize CUTLASS kernel with arguments and workspace pointer status = mGemmBatchedCudaF16F16Ln.initialize(arguments, (uint8_t *)mWorkspace); cutlass_check(status); } else { - + typename GemmBatchedCuda_F16_F32_Linear_AlignCuda_Row_Column::Arguments arguments{problem_size, // <- problem size of matrix multiplication {(ElementInput_F16 *)mBtdB_Buffer, mGemmInfo.elhPad[1]}, // Ptr + ldm (int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]), // batch_stride_A @@ -287,24 +287,24 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector &inputs, c (int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[2]), // batch_stride_C {alpha, beta}, // <- tuple of alpha and beta mBlock2}; // batch_count - + size_t workspace_size = GemmBatchedCuda_F16_F32_Linear_AlignCuda_Row_Column::get_workspace_size(arguments); - + if(workspace_size != 0) { workspaceTensor.reset(Tensor::createDevice({(int)workspace_size})); mResource->mBackend->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC); mWorkspace = (void *)workspaceTensor.get()->buffer().device; } - - // Check the problem size is supported or not + + // Check the problem size is supported or not cutlass::Status status = mGemmBatchedCudaF16F32Ln.can_implement(arguments); cutlass_check(status); - + // Initialize CUTLASS kernel with arguments and workspace pointer status = mGemmBatchedCudaF16F32Ln.initialize(arguments, (uint8_t *)mWorkspace); cutlass_check(status); } - + return NO_ERROR; } //MNN_PRINT("Winograd BatchGemm batch:%d, MNK:%d-%d-%d\n", mBlock2, mGemmInfo.elh[0], mGemmInfo.elhPad[2], mGemmInfo.elhPad[1]); @@ -316,10 +316,10 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector &inputs, c // 0 -> Gemm, 1~N -> BatchGemm int32_t batchSize = 0; // [0]->A, [1]->B, [2]->bias, [3]->output - std::pair ptrOffset[4]; + std::pair ptrOffset[4]; int32_t batchOffset[4]; // [0]->alpha, [1]->beta, [2]->splitK - int32_t coefs[3]; + int32_t coefs[3]; // 0 -> RowColumn, 1 -> RowRow int32_t layout; bool epilogueVectorize @@ -374,7 +374,7 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector &inputs, c mWorkspace = (void *)workspaceTensor.get()->buffer().device; } - // Check the problem size is supported or not + // Check the problem size is supported or not cutlass::Status status = mGemmBatchedF16F16LnSm75.can_implement(arguments); cutlass_check(status); @@ -404,7 +404,7 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector &inputs, c mWorkspace = (void *)workspaceTensor.get()->buffer().device; } - // Check the problem size is supported or not + // Check the problem size is supported or not cutlass::Status status = mGemmBatchedF16F32LnSm75.can_implement(arguments); cutlass_check(status); @@ -446,19 +446,19 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector &inputs, c int block_size = runtime->threads_num(); if(mFp32Infer) { WinoInputTrans<<>>((const float*)input_addr, (float*)mBtdB_Buffer, UNIT, - (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack, + (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack, total, lD, whD, wD, mPadX, mPadY, input->width(), input->height()); checkKernelErrors; } else if(mFp16Fp32MixInfer) { WinoInputTrans<<>>((const float*)input_addr, (half*)mBtdB_Buffer, UNIT, - (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack, + (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack, total, lD, whD, wD, mPadX, mPadY, input->width(), input->height()); checkKernelErrors; } else { WinoInputTrans<<>>((const half*)input_addr, (half*)mBtdB_Buffer, UNIT, - (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack, + (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack, total, lD, whD, wD, mPadX, mPadY, input->width(), input->height()); checkKernelErrors; @@ -487,7 +487,7 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector &inputs, c if(mIsTuned) { runGemmBatchedTensorCoreFloat16Infer(&mInfo); } - #endif + #endif if(!mIsTuned) { cutlass::Status status = mGemmBatchedF16F16LnSm75(); cutlass_check(status); @@ -500,14 +500,14 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector &inputs, c block_size = runtime->threads_num(); if (mFp16Fp32MixInfer || mFp32Infer) { WinoTrans2Output<<>>((const float*)mMatmul_Buffer, (const float*)bias_addr, (float*)output_addr, - UNIT, mBlock2, output->channel(), co_pack, + UNIT, mBlock2, output->channel(), co_pack, count, hD, whD, wD, output->width(), output->height(), mActivationType); checkKernelErrors; } else { WinoTrans2Output<<>>((const half*)mMatmul_Buffer, (const float*)bias_addr, (half*)output_addr, - UNIT, mBlock2, output->channel(), co_pack, + UNIT, mBlock2, output->channel(), co_pack, count, hD, whD, wD, output->width(), output->height(), mActivationType); diff --git a/source/backend/cuda/execution/DeconvSingleInputExecution.cu b/source/backend/cuda/execution/DeconvSingleInputExecution.cu index 1c9968c49..5bd945e06 100644 --- a/source/backend/cuda/execution/DeconvSingleInputExecution.cu +++ b/source/backend/cuda/execution/DeconvSingleInputExecution.cu @@ -17,7 +17,7 @@ namespace CUDA { DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { mBackend = bn; auto runtime = static_cast(bn)->getCUDARuntime(); - + auto conv = op->main_as_Convolution2D(); auto common = conv->common(); mKernelInfo.kernelX = common->kernelX(); @@ -33,7 +33,7 @@ DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { const float* filterDataPtr = nullptr; int weightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &filterDataPtr, &weightSize); mKernelInfo.kernelN = common->outputCount(); mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY; @@ -49,7 +49,7 @@ DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { auto tempCacheBuffer = static_cast(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float)); float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second); runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice); - + // Reorder weight if(static_cast(bn)->getPrecision() == 1) { weightTensor.reset(Tensor::createDevice({param.elhPad[1] * param.elh[2]})); @@ -57,8 +57,8 @@ DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { weightTensor.reset(Tensor::createDevice({param.elhPad[1] * param.elh[2]})); } bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC); - mFilter = (void *)weightTensor.get()->buffer().device; - + mFilter = (void *)weightTensor.get()->buffer().device; + callWeightReorder((const void *)cacheWeight, (void *)mFilter, mKernelInfo, param.elhPad[1], (int)(static_cast(bn)->getPrecision() == 1), runtime); static_cast(bn)->getStaticBufferPool()->free(tempCacheBuffer); @@ -184,12 +184,12 @@ ErrorCode DeconvSingleInputExecution::onResize(const std::vector &input mFilterAddr = mResource->mFilter; mBiasAddr = mResource->mBias; mBackendPtr = mResource->mBackend; - + // Call from different function if(mFp32Infer){ return callCutlassGemmCudaCoreFloat32(inputs, outputs); - } - + } + mGpuComputeCap = runtime->compute_capability(); //MNN_PRINT("Gpu smArch is sm_%d\n", mGpuComputeCap); if(mGpuComputeCap < 75) { @@ -214,7 +214,7 @@ ErrorCode DeconvSingleInputExecution::onExecute(const std::vector &inpu if(mFp16Fp32MixInfer) { size_t maxCount = mGemmInfo.elh[0] * mGemmInfo.elhPad[1]; callFloat2Half((const void*)input_addr, (void*)mInputBuffer, maxCount, runtime); - } + } // Run cutlass gemm forward runCutlassGemmFunc(); @@ -231,7 +231,7 @@ ErrorCode DeconvSingleInputExecution::onExecute(const std::vector &inpu class CUDADeconvolutionCreator : public CUDABackend::Creator { public: - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, + virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { if (nullptr != op->main_as_Convolution2D()->quanParameter()) { auto quan = op->main_as_Convolution2D()->quanParameter(); diff --git a/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu b/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu index d2d1adaf2..c72651e15 100644 --- a/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu +++ b/source/backend/cuda/execution/bf16/ConvCutlassBf16Execution.cu @@ -26,7 +26,7 @@ ConvCutlassBf16Execution::Resource::Resource(Backend* bn, const MNN::Op* op) { const float* filterDataPtr = nullptr; int weightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &filterDataPtr, &weightSize); auto oc = common->outputCount(); int l = weightSize / oc; diff --git a/source/backend/cuda/execution/weight_only_quant/ConvFpAIntBExecution.cu b/source/backend/cuda/execution/weight_only_quant/ConvFpAIntBExecution.cu index a9851bb53..406add181 100644 --- a/source/backend/cuda/execution/weight_only_quant/ConvFpAIntBExecution.cu +++ b/source/backend/cuda/execution/weight_only_quant/ConvFpAIntBExecution.cu @@ -52,7 +52,7 @@ __global__ void CONV_FpAInt8B(const T* input, d_oc.divmod(index, tmp1, oz_2); d_ow.divmod(tmp1, tmp2, ox); d_oh.divmod(tmp2, ob, oy); - + int oz = oz_2; int ix = ox * sw - pw; int iy = oy * sh - ph; @@ -124,7 +124,7 @@ __global__ void CONV_FpAInt4B(const T* input, d_oc.divmod(index, tmp1, oz_2); d_ow.divmod(tmp1, tmp2, ox); d_oh.divmod(tmp2, ob, oy); - + int oz = oz_2; int ix = ox * sw - pw; int iy = oy * sh - ph; @@ -215,7 +215,7 @@ bool ConvFpAIntBExecution::isValid(const Convolution2D* conv, Backend* backend) return true; } - + ConvFpAIntBExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { mBackend = bn; auto runtime = static_cast(bn)->getCUDARuntime(); @@ -224,7 +224,7 @@ ConvFpAIntBExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { auto common = conv->common(); //weight host->device - std::shared_ptr quanCommon = ConvolutionCommon::load(conv, mBackend, false, true); + std::shared_ptr quanCommon = ConvolutionCommon::load(op, mBackend, false, true); auto oc = common->outputCount(); auto weightSize = quanCommon->weight.size(); @@ -481,7 +481,7 @@ ErrorCode ConvFpAIntBExecution::onExecute(const std::vector &inputs, co maxV = 6.0f; } - auto total = outputs[0]->batch() * oh * ow * ocp; + auto total = outputs[0]->batch() * oh * ow * ocp; auto& prop = runtime->prop(); int limitThreads = UP_DIV(total, prop.multiProcessorCount); int threadNum = ALIMIN(prop.maxThreadsPerBlock/2, limitThreads); @@ -503,9 +503,9 @@ ErrorCode ConvFpAIntBExecution::onExecute(const std::vector &inputs, co (const float*)mResource->mScale, (const float*)mResource->mOffset, (const float*)bias_addr, (float*)output_addr, maxV, minV, ic, icp, iw, ih, oc, ocp, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total, d_oc, d_ow, d_oh); - checkKernelErrors; + checkKernelErrors; } - + return NO_ERROR; } @@ -520,9 +520,9 @@ ErrorCode ConvFpAIntBExecution::onExecute(const std::vector &inputs, co (const float*)mResource->mScale, (const float*)mResource->mOffset, (const float*)bias_addr, (float*)output_addr, maxV, minV, ic, icp, iw, ih, oc, ocp, ow, oh, kw, kh, dw, dh, sw, sh, pw, ph, total, d_oc, d_ow, d_oh); - checkKernelErrors; + checkKernelErrors; } - + return NO_ERROR; } diff --git a/source/backend/hiai/execution/NPUConvolution.cpp b/source/backend/hiai/execution/NPUConvolution.cpp index 40832368f..bb939dd23 100644 --- a/source/backend/hiai/execution/NPUConvolution.cpp +++ b/source/backend/hiai/execution/NPUConvolution.cpp @@ -49,7 +49,7 @@ ErrorCode NPUConvolution::onResize(const std::vector &inputs, const st std::shared_ptr quanCommon; if (nullptr != conv2D->quanParameter()) { - quanCommon = ConvolutionCommon::load(conv2D, backend(), true); + quanCommon = ConvolutionCommon::load(mOp, backend(), true); if (nullptr == quanCommon) { MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", mOp->name()->c_str()); } diff --git a/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp b/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp index 059165ef0..e856c1702 100644 --- a/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp +++ b/source/backend/hiai/execution/NPUConvolutionDepthwise.cpp @@ -49,7 +49,7 @@ ErrorCode NPUConvolutionDepthwise::onResize(const std::vector &inputs, } std::shared_ptr quanCommon; if (nullptr != conv2D->quanParameter()) { - quanCommon = ConvolutionCommon::load(conv2D, backend(), true); + quanCommon = ConvolutionCommon::load(mOp, backend(), true); if (nullptr == quanCommon) { MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", mOp->name()->c_str()); } @@ -71,7 +71,7 @@ ErrorCode NPUConvolutionDepthwise::onResize(const std::vector &inputs, shared_ptr conv(new hiai::op::ConvolutionDepthwise(opName)); auto xOp = mNpuBackend->getInputOps(mOp); - + // om input weight const op mConst_w = hiai::op::Const(opName + "_w_const"); { diff --git a/source/backend/metal/MetalConvolution.mm b/source/backend/metal/MetalConvolution.mm index c1b6ee37c..d9dabb4bf 100755 --- a/source/backend/metal/MetalConvolution.mm +++ b/source/backend/metal/MetalConvolution.mm @@ -17,7 +17,7 @@ namespace MNN { MetalConvolution::MetalConvolution(Backend *backend, const MNN::Op *op) : MetalConvolutionCommon(backend, op, nullptr) { - loadWeight(op->main_as_Convolution2D()); + loadWeight(op); } MetalConvolution::MetalConvolution(Backend *backend, const MNN::Op *op, std::shared_ptr weight, std::shared_ptr bias) : MetalConvolutionCommon(backend, op, bias) { mWeight = weight; @@ -47,7 +47,7 @@ auto oh = output->height(); auto oc_4 = UP_DIV(output->channel(), 4); auto ob = output->batch(); - + auto pads = ConvolutionCommon::convolutionPad(input, output, mOp->main_as_Convolution2D()->common()); auto padX = pads.first; auto padY = pads.second; @@ -77,7 +77,7 @@ mActivationType}; mConstBuffer = backend->getConstBuffer(sizeof(constants)); ::memcpy(mConstBuffer.contents, constants, sizeof(constants)); - + mParam = "_ic" + std::to_string(ic_4) + "oc" + std::to_string(oc_4) + "k" + std::to_string(mKernelX) + "x" + std::to_string(mKernelY) + "s" + std::to_string(mStrideX) + "x" + std::to_string(mStrideY) + @@ -119,7 +119,7 @@ int itemW[total_kernel] = {1, 1, 1, 2, 4}; int itemH[total_kernel] = {1, 1, 1, 1, 1}; int itemC[total_kernel] = {1, 4, 2, 1, 1}; - + int actual_kernel = 3; if(isS1D1P0) { actual_kernel = 4; @@ -137,7 +137,7 @@ } std::pair min_cost(INT_MAX, 0);//(min_time, min_index) - + NSArray *arr = [NSArray arrayWithObjects:(id)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(), (id)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(), mConstBuffer, (((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil]; @@ -159,7 +159,7 @@ std::string name = [shaderName[knl_idx] UTF8String] + mParam; auto ret = [context getGridAndThreadgroup:pipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name offsets: buffer_offset queue:backend->queue()]; - + if(min_cost.first > std::get<2>(ret)) { min_cost.first = std::get<2>(ret); min_cost.second = knl_idx; @@ -178,7 +178,7 @@ void MetalConvolution::onEncode(const std::vector &inputs, const std::vector &outputs, id encoder) { auto input = inputs[0]; auto output = outputs[0]; - + [encoder setComputePipelineState:mPipeline]; MetalBackend::setTensor(input, encoder, 0); MetalBackend::setTensor(output, encoder, 1); diff --git a/source/backend/metal/MetalConvolution1x1.mm b/source/backend/metal/MetalConvolution1x1.mm index cfd9f47b6..33a3eb19d 100644 --- a/source/backend/metal/MetalConvolution1x1.mm +++ b/source/backend/metal/MetalConvolution1x1.mm @@ -25,10 +25,10 @@ MetalConvolution1x1::MetalConvolution1x1(Backend *backend, const MNN::Op *op) : MetalConvolutionCommon(backend, op, nullptr) { auto conv2D = op->main_as_Convolution2D(); bool ldInt8Weight = false; - if (conv2D->quanParameter() && conv2D->quanParameter()->buffer()) { + if (conv2D->quanParameter() && (conv2D->external() || conv2D->quanParameter()->buffer())) { ldInt8Weight = true; } - loadWeight(op->main_as_Convolution2D(), ldInt8Weight); + loadWeight(op, ldInt8Weight); } MetalConvolution1x1::MetalConvolution1x1(Backend *backend, const MNN::Op *op, std::shared_ptr weight, std::shared_ptr bias, std::shared_ptr dequantScale, int dequantBits) : MetalConvolutionCommon(backend, op, bias) { @@ -78,7 +78,7 @@ int constants[] = {is, ic_4, ow, oh, os, oc_4, oc, ob, blockSize, mActivationType}; mConstBuffer = backend->getConstBuffer(sizeof(constants)); ::memcpy(mConstBuffer.contents, constants, sizeof(constants)); - + MetalRuntime* rt = (MetalRuntime *)backend->runtime(); if (mDequantScaleBias.get()) { NSUInteger gid_x = UP_DIV(ow * oh, 4); @@ -106,7 +106,7 @@ TensorUtils::getDescribe(bias)->extra.offset, TensorUtils::getDescribe(mDequantScaleBias.get())->extra.offset, 0}; - + MetalRuntime *rt = (MetalRuntime *)backend->runtime(); auto ret = [context getGridAndThreadgroup:mPipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name offsets:buffer_offset queue:backend->queue()]; mThreads = std::make_pair(std::get<0>(ret), std::get<1>(ret)); @@ -123,7 +123,7 @@ NSArray *arr = [NSArray arrayWithObjects:(id)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(), (id)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(), mConstBuffer, (id)(((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil]; - + const Tensor* weight = mWeight.get(); const Tensor* bias = mBias.get(); int buffer_offset[] = {TensorUtils::getDescribe(input)->extra.offset, TensorUtils::getDescribe(output)->extra.offset, 0, TensorUtils::getDescribe(weight)->extra.offset, TensorUtils::getDescribe(bias)->extra.offset, 0}; @@ -135,9 +135,9 @@ NSUInteger gid_x = UP_DIV(ow * oh, 4); NSUInteger gid_y = oc_4; NSUInteger gid_z = ob; - + mPipeline = [context pipelineWithName:@"conv1x1_g1z4" fp16:backend->useFp16InsteadFp32()]; - + NSArray *arr = [NSArray arrayWithObjects:(id)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(), (id)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(), mConstBuffer, (((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil]; @@ -160,23 +160,23 @@ actual_kernel = 3; } std::pair min_cost(INT_MAX, 0);//(min_time, min_index) - + NSArray *arr = [NSArray arrayWithObjects:(id)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(), (id)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(), mConstBuffer, (((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil]; const Tensor* weight = mWeight.get(); const Tensor* bias = mBias.get(); int buffer_offset[] = {TensorUtils::getDescribe(input)->extra.offset, TensorUtils::getDescribe(output)->extra.offset, 0, TensorUtils::getDescribe(weight)->extra.offset, TensorUtils::getDescribe(bias)->extra.offset, 0}; - + for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) { id pipeline = [context pipelineWithName:shaderName[knl_idx] fp16:backend->useFp16InsteadFp32()]; NSUInteger gid_x = UP_DIV(ow, itemW[knl_idx]); NSUInteger gid_y = UP_DIV(oc, itemC[knl_idx]); NSUInteger gid_z = 1; - + std::string name = [shaderName[knl_idx] UTF8String]; auto ret = [context getGridAndThreadgroup:pipeline gid:MTLSizeMake(gid_x, gid_y, gid_z) loop:10 buffer:arr runtime:rt shaderName:name offsets:buffer_offset queue:backend->queue()]; - + if(min_cost.first > std::get<2>(ret)) { min_cost.first = std::get<2>(ret); min_cost.second = knl_idx; diff --git a/source/backend/metal/MetalConvolutionCommon.hpp b/source/backend/metal/MetalConvolutionCommon.hpp index 299551922..a391d65e2 100644 --- a/source/backend/metal/MetalConvolutionCommon.hpp +++ b/source/backend/metal/MetalConvolutionCommon.hpp @@ -22,7 +22,7 @@ class MetalConvolutionCommon : public MetalExecution { virtual ~MetalConvolutionCommon() = default; protected: - void loadWeight(const MNN::Convolution2D *conv, bool loadWeightInt8 = false); + void loadWeight(const MNN::Op *op, bool loadWeightInt8 = false); virtual std::shared_ptr weightTransform(int group, int oc, int ic, int kh, int kw, const float *src, bool int8Weight = false, bool int4Weight = false); diff --git a/source/backend/metal/MetalConvolutionCommon.mm b/source/backend/metal/MetalConvolutionCommon.mm index 548aae2ef..318c138eb 100644 --- a/source/backend/metal/MetalConvolutionCommon.mm +++ b/source/backend/metal/MetalConvolutionCommon.mm @@ -146,15 +146,19 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src, } return dequantScale; } -void MetalConvolutionCommon::loadWeight(const MNN::Convolution2D *conv, bool loadWeightInt8) { +void MetalConvolutionCommon::loadWeight(const MNN::Op *op, bool loadWeightInt8) { + auto conv = op->main_as_Convolution2D(); std::shared_ptr qnt = NULL; if (loadWeightInt8) { - qnt = ConvolutionCommon::load(conv, backend(), false, true); + qnt = ConvolutionCommon::load(op, backend(), false, true); } else if (conv->quanParameter()) { - qnt = ConvolutionCommon::load(conv, backend(), true); + qnt = ConvolutionCommon::load(op, backend(), true); } // param auto size = qnt ? MAX(qnt->weight.size(), qnt->weightFloat.size()) : conv->weight()->size(); + if (loadWeightInt8 && qnt->canUseInt4) { + size *= 2; + } auto common = conv->common(); auto kw = common->kernelX(); auto kh = common->kernelY(); @@ -185,6 +189,59 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src, auto goc_4 = UP_DIV(goc, 4); auto gic_4 = UP_DIV(gic, 4); auto weight_len = group * ROUND_UP(goc_4, 4) * gic_4 * kw * kh * 16; + + if (int4Weight) { + weight_len = UP_DIV(weight_len, 2); + std::shared_ptr weightLow(MNN::Tensor::createDevice({weight_len})); + auto res = backend->onAcquireBuffer(weightLow.get(), Backend::STATIC); + if (!res) { + MNN_ERROR("Memory alloc error!\n"); + return nullptr; + } + auto srcPtr = (int8_t*)src; + auto buf = MetalBackend::getBuffer(weightLow.get()); + auto dstPtr = (uint8_t*)[buf.first contents] + buf.second; + auto oc_4 = UP_DIV(oc, 4); + auto ic_4 = UP_DIV(ic, 4); + if (group == 1 && kh == 1 && kw == 1) { + // fast int4 reorder + for (int i = 0; i < oc; i++) { + auto zo = i / 4, ro = i % 4; + for (int j = 0; j < ic; j++) { + auto zi = j / 4, ri = j % 4; + dstPtr[((zo * ic_4 + zi) * 16 + ro * 4 + ri) / 2] = srcPtr[(i * ic + j) / 2]; + } + } + } else { + // slow int4 reorder + int sx = 0; + auto goc_4 = UP_DIV(goc, 4); + auto gic_4 = UP_DIV(gic, 4); + ::memset(dstPtr, 0, weight_len); + for (int g = 0; g < group; g++) { + for (int o = 0; o < goc; o++) { + auto zo = o / 4, ro = o % 4; + for (int i = 0; i < gic; i++) { + auto zi = i / 4, ri = i % 4; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + // to [g][o/4][i/4][h][w][16] + // from [g][o][i][h][w] + int dx = g * goc_4 * gic_4 * kh * kw * 16 + zo * gic_4 * kh * kw * 16 + ro * 4 + zi * kh * kw * 16 + ri + (h * kw + w) * 16; + uint8_t s = srcPtr[sx/2]; + s = (sx % 2) ? (s & 0xf) : (s >> 4); + s = (dx % 2) ? s : (s << 4); + dstPtr[dx/2] |= s; + sx++; + } + } + } + } + } + } + return weightLow; + } + std::shared_ptr t(MNN::Tensor::createDevice({weight_len})); if (int8Weight || int4Weight) { t.reset(MNN::Tensor::createDevice({weight_len})); @@ -195,33 +252,14 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src, } auto buffer = MetalBackend::getBuffer(t.get()); auto dst = (uint8_t*)[buffer.first contents] + buffer.second; - - if (int8Weight || int4Weight) { + if (int8Weight) { weightInBlock(group, oc, ic, kh, kw, (int8_t*)src, dst); } else if (backend->useFp16InsteadFp32()) { weightInBlock(group, oc, ic, kh, kw, src, dst); } else { weightInBlock(group, oc, ic, kh, kw, src, dst); } - if (int4Weight) { - weight_len = UP_DIV(weight_len, 2); - std::shared_ptr weightLow(MNN::Tensor::createDevice({weight_len})); - auto res = backend->onAcquireBuffer(weightLow.get(), Backend::STATIC); - if (!res) { - MNN_ERROR("Memory alloc error!\n"); - return nullptr; - } - auto srcPtr = (int8_t*)dst; - auto buf = MetalBackend::getBuffer(weightLow.get()); - auto dstPtr = (uint8_t*)[buf.first contents] + buf.second; - for (int i=0; i < weight_len; ++i) { - int s0 = srcPtr[2 * i + 0]; - int s1 = srcPtr[2 * i + 1]; - int d = (s0 + 8) * 16 + (s1 + 8); - dstPtr[i] = d; - } - return weightLow; - } + return t; } diff --git a/source/backend/metal/MetalConvolutionDepthwise.mm b/source/backend/metal/MetalConvolutionDepthwise.mm index 546896bb9..85b17c88f 100755 --- a/source/backend/metal/MetalConvolutionDepthwise.mm +++ b/source/backend/metal/MetalConvolutionDepthwise.mm @@ -15,7 +15,7 @@ namespace MNN { MetalConvolutionDepthwise::MetalConvolutionDepthwise(Backend *backend, const MNN::Op *op) : MetalConvolutionCommon(backend, op, nullptr) { - loadWeight(op->main_as_Convolution2D()); + loadWeight(op); } ErrorCode MetalConvolutionDepthwise::onResize(const std::vector &inputs, @@ -60,14 +60,14 @@ mConstBuffer = backend->getConstBuffer(sizeof(constants)); ::memcpy(mConstBuffer.contents, constants, sizeof(constants)); - + auto context = (__bridge MNNMetalContext *)backend->context(); mPipeline = [context pipelineWithName:@"conv_depthwise" fp16:backend->useFp16InsteadFp32()]; - + NSUInteger gid_x = ow; NSUInteger gid_y = oh; NSUInteger gid_z = oc_4*ob; - + NSArray *arr = [NSArray arrayWithObjects:(id)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(), (id)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(), mConstBuffer, (id)(((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil]; diff --git a/source/backend/metal/MetalConvolutionWinograd.mm b/source/backend/metal/MetalConvolutionWinograd.mm index dcc321782..a7db2c4cd 100644 --- a/source/backend/metal/MetalConvolutionWinograd.mm +++ b/source/backend/metal/MetalConvolutionWinograd.mm @@ -44,7 +44,7 @@ auto conv = op->main_as_Convolution2D(); mSrcUnit = UNIT + conv->common()->kernelY() - 1; mDstUnit = UNIT; - loadWeight(conv); + loadWeight(op); } MetalConvolutionWinograd::MetalConvolutionWinograd(Backend *backend, const MNN::Op *op, std::shared_ptr weight, std::shared_ptr bias) : MetalConvolutionCommon(backend, op, bias) { auto conv = op->main_as_Convolution2D(); @@ -81,7 +81,7 @@ auto pads = ConvolutionCommon::convolutionPad(input, output, mOp->main_as_Convolution2D()->common()); auto padX = pads.first; auto padY = pads.second; - + // create const buffer struct TransformBuffer { int inputSize[4]; diff --git a/source/backend/metal/MetalDeconvolution.mm b/source/backend/metal/MetalDeconvolution.mm index 38921d690..4338d9e30 100755 --- a/source/backend/metal/MetalDeconvolution.mm +++ b/source/backend/metal/MetalDeconvolution.mm @@ -145,7 +145,7 @@ void weightForDeconv(std::shared_ptr t, bool depthwise, const Convo // forcy downgrade to float like what CPU does std::shared_ptr qnt = NULL; if (deconv->quanParameter()) { - qnt = ConvolutionCommon::load(deconv, backend, true); + qnt = ConvolutionCommon::load(op, backend, true); } auto kw = common->kernelX(); auto kh = common->kernelY(); @@ -195,7 +195,7 @@ void weightForDeconv(std::shared_ptr t, bool depthwise, const Convo auto pad = ConvolutionCommon::convolutionTransposePad(input, output, mOp->main_as_Convolution2D()->common()); const int padX = pad.first; const int padY = pad.second; - + // const buffer auto deltaKy = leastCommonMultiple(mDilateY, mStrideY) / mDilateY; auto deltaKx = leastCommonMultiple(mDilateX, mStrideX) / mDilateX; @@ -227,7 +227,7 @@ void weightForDeconv(std::shared_ptr t, bool depthwise, const Convo mActivationType }; mConstBuffer = [context newDeviceBuffer:sizeof(consts) bytes:consts access:CPUWriteOnly]; - + mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger) ow, (NSUInteger)oh, (NSUInteger)oz * ob)]; return NO_ERROR; } diff --git a/source/backend/nnapi/execution/NNAPIConvolution.cpp b/source/backend/nnapi/execution/NNAPIConvolution.cpp index 3c064b141..4aecae0d2 100644 --- a/source/backend/nnapi/execution/NNAPIConvolution.cpp +++ b/source/backend/nnapi/execution/NNAPIConvolution.cpp @@ -95,7 +95,7 @@ ErrorCode NNAPIConvolution::onResize(const std::vector &inputs, const weightPtr = conv2D->quanParameter()->buffer()->data(); weightSize = conv2D->quanParameter()->buffer()->size(); } else if (nullptr != conv2D->quanParameter()) { - quanCommon = ConvolutionCommon::load(conv2D, backend(), true); + quanCommon = ConvolutionCommon::load(mOp, backend(), true); if (nullptr == quanCommon) { MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", mOp->name()->c_str()); } diff --git a/source/backend/opencl/CMakeLists.txt b/source/backend/opencl/CMakeLists.txt index ace6942cf..fc2fbdf1e 100644 --- a/source/backend/opencl/CMakeLists.txt +++ b/source/backend/opencl/CMakeLists.txt @@ -26,7 +26,7 @@ ENDIF() if (${CMAKE_SYSTEM_NAME} MATCHES "Android") add_definitions(-DMNN_USE_LIB_WRAPPER) add_definitions(-DMNN_OPENCL_SVM_ENABLE) - add_definitions(-DCL_HPP_TARGET_OPENCL_VERSION=120) + add_definitions(-DCL_HPP_TARGET_OPENCL_VERSION=110) else() if(${CMAKE_SYSTEM_NAME} MATCHES "Windows" OR ${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR MNN_SUPPORT_INTEL_SUBGROUP) add_definitions(-DMNN_SUPPORT_INTEL_SUBGROUP) @@ -36,7 +36,7 @@ else() else() add_definitions(-DMNN_USE_LIB_WRAPPER) add_definitions(-DMNN_OPENCL_SVM_ENABLE) - add_definitions(-DCL_HPP_TARGET_OPENCL_VERSION=120) + add_definitions(-DCL_HPP_TARGET_OPENCL_VERSION=110) endif() endif() IF(MNN_SEP_BUILD) diff --git a/source/backend/opencl/core/OpenCLBackend.cpp b/source/backend/opencl/core/OpenCLBackend.cpp index d9bc65fab..b08f6a5cf 100644 --- a/source/backend/opencl/core/OpenCLBackend.cpp +++ b/source/backend/opencl/core/OpenCLBackend.cpp @@ -596,7 +596,7 @@ void OpenCLBackend::_allocHostBuffer(int length, const Tensor* srcTensor) const mDeviceBuffer = (cl::Buffer*)srcTensor->buffer().device; } #ifdef __ANDROID__ - else if(memType == MNN_FORWARD_OPENGL){ + else if(memType == MNN_FORWARD_OPENGL && mOpenCLRuntime->isSupportGL()){ cl_int error; mDeviceTexture.reset(new cl::ImageGL(mOpenCLRuntime->context(), CL_MEM_READ_WRITE, GL_TEXTURE_2D, 0, (cl_GLuint)srcTensor->buffer().device, &error)); std::vector map = {*mDeviceTexture.get()}; @@ -671,7 +671,7 @@ void CLRuntime::convertFromDevice(const Tensor* srcTensor, const Tensor* dstTens #ifndef MNN_OPENCL_BUFFER_CLOSED if(mOpenCLRuntime->getGpuMemType() == BUFFER) { - if(MNN_FORWARD_OPENGL == memtype){ + if(MNN_FORWARD_OPENGL == memtype && mOpenCLRuntime->isSupportGL()){ OpenCL::convertNC4HW4BufferToImage(srcTensor, const_cast(dstTensor), mOpenCLRuntime.get(), false, svmFlag); std::vector map = {openCLImage(dstTensor)}; mOpenCLRuntime->commandQueue().enqueueReleaseGLObjects(&map, NULL); @@ -722,7 +722,7 @@ void CLRuntime::convertFromDevice(const Tensor* srcTensor, const Tensor* dstTens else #endif /* MNN_OPENCL_BUFFER_CLOSED */ { - if(MNN_FORWARD_OPENGL == memtype){ + if(MNN_FORWARD_OPENGL == memtype && mOpenCLRuntime->isSupportGL()){ std::vector bufferShape = MNN::OpenCL::tensorShapeFormat(srcTensor); mOpenCLRuntime.get()->commandQueue().enqueueCopyImage( @@ -784,7 +784,7 @@ void CLRuntime::convertToDevice(const Tensor* srcTensor, const Tensor* dstTensor #ifndef MNN_OPENCL_BUFFER_CLOSED if(mOpenCLRuntime->getGpuMemType() == BUFFER) { - if(MNN_FORWARD_OPENGL == memtype){ + if(MNN_FORWARD_OPENGL == memtype && mOpenCLRuntime->isSupportGL()){ OpenCL::convertImageToNC4HW4Buffer(srcTensor, const_cast(dstTensor),mOpenCLRuntime.get(), false, svmFlag); std::vector map = {openCLImage(srcTensor)}; mOpenCLRuntime->commandQueue().enqueueReleaseGLObjects(&map, NULL); @@ -821,7 +821,7 @@ void CLRuntime::convertToDevice(const Tensor* srcTensor, const Tensor* dstTensor else #endif /* MNN_OPENCL_BUFFER_CLOSED */ { - if(MNN_FORWARD_OPENGL == memtype){ + if(MNN_FORWARD_OPENGL == memtype && mOpenCLRuntime->isSupportGL()){ std::vector bufferShape = MNN::OpenCL::tensorShapeFormat(dstTensor); mOpenCLRuntime.get()->commandQueue().enqueueCopyImage( @@ -880,7 +880,11 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr); } #else - mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr); + auto res = mOpenCLRuntime->commandQueue().enqueueWriteBuffer(*mHostBuffer.second, CL_TRUE, 0, needSize, hostPtr); + if(res != CL_SUCCESS) { + MNN_ERROR("OpenCL enqueue write error:%d\n", res); + return; + } #endif //Covert format @@ -902,6 +906,10 @@ void OpenCLBackend::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dst MNN_PRINT("Unsupport ForwardType %d for OpenCL backend!\n", memType); return; } + if(mOpenCLRuntime->isSupportGL() && MNN_FORWARD_OPENGL == memType){ + MNN_PRINT("This Device can not find OpenCL GL_EXTENTION function!\n"); + return; + } _allocHostBuffer(0, copyTensor); MNN::Tensor interTensor(copyTensor, copyTensor->getDimensionType(), false); @@ -912,10 +920,6 @@ void OpenCLBackend::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dst }else{ interTensor.buffer().device = (uint64_t)mHostBuffer.second.get(); } - if(OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isGlError() && MNN_FORWARD_OPENGL == memType){ - MNN_PRINT("This Device can not find OpenCL GL_EXTENTION function!\n"); - return; - } //Covert format MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(copyTensor)->dimensionFormat; if(MNN_FORWARD_CPU != srcMemtype){ diff --git a/source/backend/opencl/core/OpenCLGemmTune.cpp b/source/backend/opencl/core/OpenCLGemmTune.cpp index 00cd3ed98..388fba6f0 100644 --- a/source/backend/opencl/core/OpenCLGemmTune.cpp +++ b/source/backend/opencl/core/OpenCLGemmTune.cpp @@ -135,7 +135,7 @@ std::vector getGemmParams(const std::vector &gemmSize, const MNN_ASSERT(gemmSize[1] % 16 == 0); MNN_ASSERT(gemmSize[2] % 4 == 0); - MNN_ASSERT((gemmSize[5] == 0 && tensorMemory.size() == 3) || (gemmSize[5] == 1 && tensorMemory.size() == 4)); + MNN_ASSERT((gemmSize[5] == 0 && tensorMemory.size() == 3) || (gemmSize[5] >= 1 && tensorMemory.size() == 4)); auto& tunedGemmParams = runtime->tunedGemmParamsMap(); std::vector info(gemmSize); @@ -292,8 +292,8 @@ std::vector getGemmParams(const std::vector &gemmSize, const buildOptions.emplace(" -DRELAX_WORKGROUP_SIZE=1"); } - if(gemmSize[5] == 1) { - buildOptions.emplace(" -DBIAS"); + if(gemmSize[5] >= 1) { + buildOptions.emplace(" -DBIAS_TYPE=" + std::to_string((int)gemmSize[5])); } int localM = mdimc; @@ -346,6 +346,8 @@ std::vector getGemmParams(const std::vector &gemmSize, const if(gemmSize[5] == 1) { ret |= kernel->get().setArg(idx++, tensorMemory[3]); ret |= kernel->get().setArg(idx++, gemmSize[1]); + } else if(gemmSize[5] > 1) { + MNN_ERROR("BatchGemm with bias type > 1 (elementwise) not supported! please check\n"); } ret |= kernel->get().setArg(idx++, tensorMemory[2]); ret |= kernel->get().setArg(idx++, batch_offset_c); @@ -362,16 +364,19 @@ std::vector getGemmParams(const std::vector &gemmSize, const int offset_a = 0; int offset_b = 0; int offset_c = 0; - + int offset[4] = {0, 0, 0, 0}; + int stride[4] = {(int)gemmSize[0], (int)gemmSize[1], (int)gemmSize[1], (int)gemmSize[1]}; + if(gemmSize[3] < 4) { + stride[2] = gemmSize[0]; // output: [N, M] + } ret |= kernel->get().setArg(idx++, tensorMemory[0]); ret |= kernel->get().setArg(idx++, tensorMemory[1]); - if(gemmSize[5] == 1) { + if(gemmSize[5] >= 1) { ret |= kernel->get().setArg(idx++, tensorMemory[3]); } ret |= kernel->get().setArg(idx++, tensorMemory[2]); - ret |= kernel->get().setArg(idx++, offset_a); - ret |= kernel->get().setArg(idx++, offset_b); - ret |= kernel->get().setArg(idx++, offset_c); + ret |= kernel->get().setArg(idx++, offset); + ret |= kernel->get().setArg(idx++, stride); MNN_CHECK_CL_SUCCESS(ret, "setArg getGemmParams Xgemm Kernel"); diff --git a/source/backend/opencl/core/OpenCLRunningUtils.cpp b/source/backend/opencl/core/OpenCLRunningUtils.cpp index 3898224d3..549d2be10 100644 --- a/source/backend/opencl/core/OpenCLRunningUtils.cpp +++ b/source/backend/opencl/core/OpenCLRunningUtils.cpp @@ -589,5 +589,21 @@ bool localWSTune(const std::mappreParamsMap(); + if (preParamInfo.find(preParamName) != preParamInfo.end()) { + *preParamData = preParamInfo[preParamName]; + return true; + } + return false; +} + +void setPreParamInfo(const std::string preParamName, uint32_t preParamData, OpenCLRuntime *runtime){ + auto& preParamInfo = runtime->preParamsMap(); + if (preParamInfo.find(preParamName) == preParamInfo.end()) { + preParamInfo.insert(std::make_pair(preParamName, preParamData)); + } +} + } // namespace OpenCL } // namespace MNN diff --git a/source/backend/opencl/core/OpenCLRunningUtils.hpp b/source/backend/opencl/core/OpenCLRunningUtils.hpp index 63c3fd7df..f9a911beb 100644 --- a/source/backend/opencl/core/OpenCLRunningUtils.hpp +++ b/source/backend/opencl/core/OpenCLRunningUtils.hpp @@ -126,6 +126,10 @@ bool localWSTune(const std::map, uint32_t> localWS2DDefault(const std::vector &gws, const uint32_t maxWorkGroupSize, OpenCLRuntime *runtime, const std::string &kernelName, const std::shared_ptr &mKernel); +bool getPreParamInfo(const std::string preParamName, uint32_t *preParamData, OpenCLRuntime *runtime); + +void setPreParamInfo(const std::string preParamName, uint32_t preParamData, OpenCLRuntime *runtime); + void copyBufferToImage(OpenCLRuntime *runtime, const cl::Buffer &buffer, const cl::Image &image, int w, int h); } // namespace OpenCL diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp index 7d6c4e5de..51ba62619 100644 --- a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp +++ b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp @@ -195,6 +195,7 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const mIsDeviceSupportedLowPower = (mIsDeviceSupportedLowPower || isPriorityHint); #ifdef MNN_USE_LIB_WRAPPER + mIsSupportGL = !OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isGlError(); if(isPriorityHint) { if(true == OpenCLSymbolsOperator::getOpenclSymbolsPtr()->isPropError()) @@ -413,6 +414,10 @@ unsigned int OpenCLRuntime::getQueueNum() { return mQueueCount; } +std::map& OpenCLRuntime::preParamsMap(){ + return mPreParams; +} + std::map, std::vector>& OpenCLRuntime::tunedGemmParamsMap() { return mTunedGemmParams; } @@ -863,6 +868,14 @@ std::pair OpenCLRuntime::makeCache(void* tuneInfo) { cache->gemm.emplace_back(std::move(tuning)); } + // Get All PreParam cache + for(auto& iter : mPreParams){ + std::unique_ptr info(new PreParamInfoT); + info->preParamName = iter.first; + info->preParamData = iter.second; + cache->preParam.emplace_back(std::move(info)); + } + flatbuffers::FlatBufferBuilder builder; auto lastOffset = Cache::Pack(builder, cache.get()); builder.Finish(lastOffset); @@ -964,6 +977,19 @@ bool OpenCLRuntime::setCache(std::pair cache) { mTunedGemmParams.insert(std::make_pair(info, params)); } } + + //Load PreParam Info + if(nullptr != cacheBuffer->preParam()){ + auto preParamInfo = cacheBuffer->preParam(); + for(int i = 0; i < preParamInfo->size(); ++i){ + auto info = preParamInfo->GetAs(i); + if (nullptr == info->preParamName()) { + MNN_ERROR("Error preParam info\n"); + return false; + } + mPreParams.insert(std::make_pair(info->preParamName()->str(), info->preParamData())); + } + } return true; } diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp index 13f40a6c6..4a85de8bd 100644 --- a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp +++ b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp @@ -109,6 +109,9 @@ class OpenCLRuntime { float getCLVersion() { return mCLVersion; } + bool isSupportGL(){ + return mIsSupportGL; + } #ifdef MNN_OPENCL_SVM_ENABLE cl_device_svm_capabilities getSvmCapabilities() { return mSvmCapabilities; @@ -141,6 +144,8 @@ class OpenCLRuntime { unsigned int mKernelTime = 0; + std::map& preParamsMap(); + std::map, std::vector>& tunedGemmParamsMap(); std::map>, std::pair, uint32_t>>& tunedLwsMap(); @@ -209,6 +214,7 @@ class OpenCLRuntime { bool mSupportDotInt8 = false; bool mSupportDotAccInt8 = false; bool mSupportedIntelSubgroup = false; + bool mIsSupportGL = true; GpuType mGpuType; MaliAr mMaliAr; float mCLVersion = 1.0f; @@ -228,6 +234,7 @@ class OpenCLRuntime { double mStartNanos; double mStopNanos; + std::map mPreParams; std::map, std::vector> mTunedGemmParams; std::map>, std::pair, uint32_t>> mTunedLws; std::map, std::pair, uint32_t>>>> mTuneLws; diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp index deaf9b627..8dc9957cf 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp @@ -127,9 +127,6 @@ bool OpenCLSymbols::isGlError() { return mGlError; } -bool OpenCLSymbols::isCL1_2Error() { - return mCL_12Error; -} bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { #if defined(WIN32) @@ -157,11 +154,6 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { mQcomError = true; \ } -#define MNN_LOAD_CL_12_PTR(func_name) func_name = reinterpret_cast(GetProcAddress(handle_, #func_name)); \ - if(func_name == nullptr){ \ - mCL_12Error = true; \ - } - #define MNN_LOAD_GL_PTR(func_name) func_name = reinterpret_cast(GetProcAddress(handle_, #func_name)); \ if(func_name == nullptr){ \ mGlError = true; \ @@ -213,14 +205,6 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { mQcomError = true; \ } -#define MNN_LOAD_CL_12_PTR(func_name) func_name = reinterpret_cast(dlsym(handle_, #func_name)); \ - if(func_name == nullptr && loadOpenCLPointer != nullptr){ \ - func_name = reinterpret_cast(loadOpenCLPointer(#func_name)); \ - } \ - if(func_name == nullptr){ \ - mCL_12Error = true; \ - } - #define MNN_LOAD_GL_PTR(func_name) func_name = reinterpret_cast(dlsym(handle_, #func_name)); \ if(func_name == nullptr && loadOpenCLPointer != nullptr){ \ func_name = reinterpret_cast(loadOpenCLPointer(#func_name)); \ @@ -282,9 +266,6 @@ bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { MNN_LOAD_GL_PTR(clCreateFromGLTexture); MNN_LOAD_GL_PTR(clEnqueueAcquireGLObjects); MNN_LOAD_GL_PTR(clEnqueueReleaseGLObjects); - MNN_LOAD_CL_12_PTR(clCreateImage); - MNN_LOAD_CL_12_PTR(clRetainDevice); - MNN_LOAD_CL_12_PTR(clReleaseDevice); MNN_LOAD_PROP_PTR(clCreateCommandQueueWithProperties); MNN_LOAD_SVM_PTR(clSVMAlloc); @@ -664,12 +645,6 @@ cl_int CL_API_CALL clFinish(cl_command_queue command_queue) { return func(command_queue); } -cl_mem CL_API_CALL clCreateImage(cl_context context, cl_mem_flags flags, const cl_image_format *image_format, const cl_image_desc *image_desc, void *host_ptr, cl_int *errcode_ret) { - auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clCreateImage; - MNN_CHECK_NOTNULL(func); - return func(context, flags, image_format, image_desc, host_ptr, errcode_ret); -} - cl_mem CL_API_CALL clCreateImage2D(cl_context context, cl_mem_flags flags, const cl_image_format *image_format, size_t imageWidth, size_t imageHeight, size_t image_row_pitch, void *host_ptr, cl_int *errcode_ret) { auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clCreateImage2D; @@ -740,18 +715,6 @@ cl_int CL_API_CALL clEnqueueReleaseGLObjects(cl_command_queue command_queue, return func(command_queue, num_objects, mem_objects, num_events_in_wait_list, event_wait_list, event); } -cl_int CL_API_CALL clRetainDevice(cl_device_id device){ - auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clRetainDevice; - MNN_CHECK_NOTNULL(func); - return func(device); -} - -cl_int CL_API_CALL clReleaseDevice(cl_device_id device){ - auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clReleaseDevice; - MNN_CHECK_NOTNULL(func); - return func(device); -} - // clCreateCommandQueueWithProperties wrapper cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties(cl_context context, cl_device_id device, const cl_queue_properties *properties, cl_int *errcode_ret) { auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clCreateCommandQueueWithProperties; diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp index e3617d92d..561ccde8c 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp @@ -17,9 +17,8 @@ #endif #include #include "core/Macro.h" -#define CL_TARGET_OPENCL_VERSION 200 -#define CL_HPP_TARGET_OPENCL_VERSION 120 -#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#define CL_HPP_TARGET_OPENCL_VERSION 110 +#define CL_HPP_MINIMUM_OPENCL_VERSION 110 #if !defined(_MSC_VER) #pragma GCC diagnostic push @@ -54,7 +53,6 @@ class OpenCLSymbols { bool isSvmError(); bool isPropError(); bool isQcomError(); - bool isCL1_2Error(); bool isGlError(); using clGetPlatformIDsFunc = cl_int (CL_API_CALL *)(cl_uint, cl_platform_id *, cl_uint *); @@ -188,7 +186,6 @@ class OpenCLSymbols { MNN_CL_DEFINE_FUNC_PTR(clReleaseKernel); MNN_CL_DEFINE_FUNC_PTR(clCreateProgramWithSource); MNN_CL_DEFINE_FUNC_PTR(clCreateBuffer); - MNN_CL_DEFINE_FUNC_PTR(clCreateImage); MNN_CL_DEFINE_FUNC_PTR(clCreateImage2D); MNN_CL_DEFINE_FUNC_PTR(clRetainKernel); MNN_CL_DEFINE_FUNC_PTR(clCreateKernel); @@ -232,8 +229,6 @@ class OpenCLSymbols { MNN_CL_DEFINE_FUNC_PTR(clCreateFromGLTexture); MNN_CL_DEFINE_FUNC_PTR(clEnqueueAcquireGLObjects); MNN_CL_DEFINE_FUNC_PTR(clEnqueueReleaseGLObjects); - MNN_CL_DEFINE_FUNC_PTR(clRetainDevice); - MNN_CL_DEFINE_FUNC_PTR(clReleaseDevice); MNN_CL_DEFINE_FUNC_PTR(clCreateCommandQueueWithProperties); MNN_CL_DEFINE_FUNC_PTR(clSVMAlloc); diff --git a/source/backend/opencl/execution/buffer/CastBufExecution.cpp b/source/backend/opencl/execution/buffer/CastBufExecution.cpp index ee7e51d35..dd4debd80 100644 --- a/source/backend/opencl/execution/buffer/CastBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/CastBufExecution.cpp @@ -60,6 +60,7 @@ ErrorCode CastBufExecution::onEncode(const std::vector& inputs, const s openCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalSize); unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; unit.localWorkSize = {mLocalSize[0], mLocalSize[1], mLocalSize[2]}; + return NO_ERROR; } diff --git a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp index 8eb739f28..ba25bda93 100644 --- a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp @@ -36,7 +36,7 @@ ConvBufCommonExecution::ConvBufCommonExecution(const Convolution2D *conv2dParams mResource->mBias.reset(Tensor::createDevice({1, 1, 1, ROUND_UP(biasSize, 32)})); backend->onAcquireBuffer(mResource->mBias.get(), Backend::STATIC); cl::Buffer &biasBuffer = openCLBuffer(mResource->mBias.get()); - + cl_int res; auto biasPtrCL = openclBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer( biasBuffer, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res); @@ -103,7 +103,7 @@ ConvBufExecution::ConvBufExecution(const std::vector &inputs, const st auto padding = ConvolutionCommon::convolutionPad(inputs[0], outputs[0], mResource->mConv2dCommonParams); mPaddings[0] = padding.second;//padY mPaddings[1] = padding.first;//padX - + mResource->mKernelWidth = conv2dCommonParams->kernelX(); mResource->mKernelHeight = conv2dCommonParams->kernelY(); mResource->mOutputChannel = conv2dCommonParams->outputCount(); @@ -116,7 +116,7 @@ ConvBufExecution::ConvBufExecution(const std::vector &inputs, const st mResource->mRasterExe.reset(new RasterBufExecution({mResource->mFilter.get()}, op, mOpenCLBackend)); } else { int weightSize = 0; - ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2dParams, &mFilterDataPtr, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &mFilterDataPtr, &weightSize); //select opt conv method bool isConv1x1 = (mResource->mKernelHeight == mResource->mKernelWidth && mResource->mKernelHeight == 1 && mPaddings[0] == 0 && mPaddings[1] == 0 && mResource->mStrides[0] == 1 && mResource->mStrides[1] == 1); @@ -132,7 +132,6 @@ ConvBufExecution::ConvBufExecution(const std::vector &inputs, const st // Tile Match with mConvGemmOptLevel == 2 int tileK = 4; int tileN = 32; - int buffer_size = ROUND_UP(mResource->mOutputChannel, tileN) * ROUND_UP(mResource->mInputChannel, tileK); mResource->mFilter.reset( Tensor::createDevice({buffer_size})); @@ -176,7 +175,7 @@ ConvBufExecution::ConvBufExecution(const std::vector &inputs, const st std::vector filterImageShape{ROUND_UP(mResource->mInputChannel, 4), (UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight)}; std::shared_ptr filterBuffer( Tensor::createDevice({mResource->mOutputChannel, ROUND_UP(mResource->mInputChannel, 4), mResource->mKernelWidth, mResource->mKernelHeight})); - + int buffer_size = filterBuffer->elementSize() * sizeof(float); cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size); filterBuffer->buffer().device = (uint64_t)(&filterBufferCL); @@ -199,12 +198,12 @@ ConvBufExecution::ConvBufExecution(const std::vector &inputs, const st mResource->mFilter.reset(Tensor::createDevice({1, filterImageShape[1], 1, 4 * filterImageShape[0]})); mOpenCLBackend->onAcquireBuffer(mResource->mFilter.get(), Backend::STATIC); MNN::OpenCL::BufferConvertor bufferConvertor{mOpenCLBackend->getOpenCLRuntime()}; - + bool needTrans = true; bufferConvertor.convertToNC4HW4Buffer(filterBuffer.get(), MNN::OpenCL::CONV2D_FILTER, mResource->mFilter.get(), needTrans); } } - + if (mResource->mConv2dCommonParams->relu()) { mResource->mBuildOptions.emplace("-DRELU"); } else if (mResource->mConv2dCommonParams->relu6()) { @@ -270,17 +269,17 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const auto padding = ConvolutionCommon::convolutionPad(input, output, mResource->mConv2dCommonParams); mPaddings[0] = padding.second;//padY mPaddings[1] = padding.first;//padX - + // printf("nchw %d %d %d %d, cohw %d %d %d, khw %d %d gemm:%d \n", inputs[0]->batch(), inputs[0]->channel(), inputs[0]->height(), inputs[0]->width(), outputs[0]->channel(), outputs[0]->height(), outputs[0]->width(), mResource->mKernelWidth, mResource->mKernelHeight, mResource->mConvGemmOptLevel); - + std::string info = std::to_string(inputChannels) + "_" + std::to_string(outChannel) + "_" + std::to_string(mResource->mKernelHeight) + "_" + std::to_string(mResource->mKernelWidth) + "_" + std::to_string(mResource->mStrides[0]) + "_" + std::to_string(mResource->mStrides[1]) + "_" + std::to_string(mResource->mDilations[0]) + "_" + std::to_string(mResource->mDilations[1]); - + if (mResource->mConvGemmOptLevel > 0) { int area = height * width; int M = outputShape.at(0) * area; int N = outputShape.at(3); int K = inputShape.at(3); - + bool isAlign = (K % 8 == 0 && area == 1 && N % 64 == 0 && M % 64 == 0); bool isLimitSize = (M * 1.0 / 512 * N / 512 * K / 512 <= 1.0) && (1.0 * M * K / N / N >= 16.0); if(isAlign && isLimitSize) { @@ -289,7 +288,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const mResource->mConvGemmOptLevel = 0; } } - + if (mResource->mConvGemmOptLevel == 2) { // set large tile int tileM = 16; @@ -300,27 +299,22 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const int M = outputShape.at(0) * area; int N = outputShape.at(3); int K = inputShape.at(3); - + int alignM = ROUND_UP(M, tileM); int alignN = ROUND_UP(N, tileN); int alignK = ROUND_UP(K, tileK); - + // ReArrange input mConvGemmInpTensor.reset(Tensor::createDevice({alignK * alignM})); mOpenCLBackend->onAcquireBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); - if(N != alignN || M != alignM || area != 1) { - mNeedOutTempTensor = true; - mConvGemmOutTensor.reset(Tensor::createDevice({alignN * alignM})); - mOpenCLBackend->onAcquireBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); - } - mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); - if(mNeedOutTempTensor) { - mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); - } + mNeedOutTempTensor = true; + mConvGemmOutTensor.reset(Tensor::createDevice({alignN * alignM})); + mOpenCLBackend->onAcquireBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); + { std::set buildOptions; - + int m_pack = 1; if(area == 1) { m_pack = 4; @@ -352,89 +346,15 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const mPreGlobalWorkSize[0] = ROUND_UP(mPreGlobalWorkSize[0], std::max((uint32_t)1, mPreLocalWorkSize[0])); mPreGlobalWorkSize[1] = ROUND_UP(mPreGlobalWorkSize[1], std::max((uint32_t)1, mPreLocalWorkSize[1])); } - std::set buildOptions; - - uint32_t hasBias = 0; - if(!mNeedOutTempTensor) { - hasBias = 1; - buildOptions = mResource->mBuildOptions; - buildOptions.emplace("-DBIAS"); - } - uint32_t layout = 4; - uint32_t batch = 1; - - cl::Buffer outBuffer = mNeedOutTempTensor ? openCLBuffer(mConvGemmOutTensor.get()) : openCLBuffer(output); - std::vector param; - if(mNeedOutTempTensor) { - param = getGemmParams({(uint32_t)alignM, (uint32_t)alignN, (uint32_t)alignK, layout, batch, hasBias}, {openCLBuffer(mConvGemmInpTensor.get()), openCLBuffer(mResource->mFilter.get()), openCLBuffer(mConvGemmOutTensor.get())}, mOpenCLBackend->getOpenCLRuntime()); - } else { - param = getGemmParams({(uint32_t)alignM, (uint32_t)alignN, (uint32_t)alignK, layout, batch, hasBias}, {openCLBuffer(mConvGemmInpTensor.get()), openCLBuffer(mResource->mFilter.get()), openCLBuffer(output), openCLBuffer(mResource->mBias.get())}, mOpenCLBackend->getOpenCLRuntime()); - } - int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13]; - buildOptions.emplace("-DKWG=" + std::to_string(KWG)); - buildOptions.emplace("-DKWI=" + std::to_string(KWI)); - buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA)); - buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC)); - buildOptions.emplace("-DMWG=" + std::to_string(MWG)); - buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB)); - buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC)); - buildOptions.emplace("-DNWG=" + std::to_string(NWG)); - buildOptions.emplace("-DSA=" + std::to_string(SA)); - buildOptions.emplace("-DSB=" + std::to_string(SB)); - buildOptions.emplace("-DSTRM=" + std::to_string(STRM)); - buildOptions.emplace("-DSTRN=" + std::to_string(STRN)); - buildOptions.emplace("-DVWM=" + std::to_string(VWM)); - buildOptions.emplace("-DVWN=" + std::to_string(VWN)); - if(layout >= 4) { - buildOptions.emplace("-DOUTPUTMN"); - } - - tileM = MWG; - tileN = NWG; - int localM = MDIMC; - int localN = NDIMC; - - if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) { - buildOptions.emplace("-DUSE_CL_MAD=1"); - buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1"); - } - - mKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_params_buf", "Xgemm", buildOptions); - - int out_per_thread_m = tileM / localM; - int out_per_thread_n = tileN / localN; - - mGlobalWorkSize = {static_cast(alignM/out_per_thread_m), static_cast(alignN/out_per_thread_n)}; - mLocalWorkSize = {static_cast(localM), static_cast(localN)}; - - float alpha = 1.0; - float beta = 0.0f; - int offset = 0; - int idx = 0; - cl_int ret = CL_SUCCESS; - ret |= mKernel->get().setArg(idx++, static_cast(alignM)); - ret |= mKernel->get().setArg(idx++, static_cast(alignN)); - ret |= mKernel->get().setArg(idx++, static_cast(alignK)); - ret |= mKernel->get().setArg(idx++, alpha); - ret |= mKernel->get().setArg(idx++, beta); - ret |= mKernel->get().setArg(idx++, openCLBuffer(mConvGemmInpTensor.get())); - ret |= mKernel->get().setArg(idx++, openCLBuffer(mResource->mFilter.get())); - if(mNeedOutTempTensor) { - ret |= mKernel->get().setArg(idx++, openCLBuffer(mConvGemmOutTensor.get())); - } else { - ret |= mKernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); - ret |= mKernel->get().setArg(idx++, openCLBuffer(output)); + // call gemm strassen + { + mStrassenComputor.reset(new StrassenMatrixComputor(backend(), 3)); + mStrassenComputor->onEncode(alignM, alignK, alignN, alignM, alignN, alignN, openCLBuffer(mConvGemmInpTensor.get()), openCLBuffer(mResource->mFilter.get()), openCLBuffer(mConvGemmOutTensor.get()), + false, openCLBuffer(mResource->mBias.get())); } - ret |= mKernel->get().setArg(idx++, offset); - ret |= mKernel->get().setArg(idx++, offset); - ret |= mKernel->get().setArg(idx++, offset); - - MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf mConvgemmOptLevel==2 Kernel Select"); - mOpenCLBackend->recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize); - mGlobalWorkSize[0] = ROUND_UP(mGlobalWorkSize[0], std::max((uint32_t)1, mLocalWorkSize[0])); - mGlobalWorkSize[1] = ROUND_UP(mGlobalWorkSize[1], std::max((uint32_t)1, mLocalWorkSize[1])); + // call output transpose if(mNeedOutTempTensor) { std::set buildOptions = mResource->mBuildOptions; if(area == 1) { @@ -464,9 +384,15 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const mOpenCLBackend->recordKernel2d(mPostKernel, mPostGlobalWorkSize, mPostLocalWorkSize); mPostGlobalWorkSize[0] = ROUND_UP(mPostGlobalWorkSize[0], std::max((uint32_t)1, mPostLocalWorkSize[0])); mPostGlobalWorkSize[1] = ROUND_UP(mPostGlobalWorkSize[1], std::max((uint32_t)1, mPostLocalWorkSize[1])); - + mOpenCLBackend->endRecord(mRecording); } + + mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); + if(mNeedOutTempTensor) { + mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); + } + return NO_ERROR; } else if (mResource->mConvGemmOptLevel == 1) { // set small tile @@ -489,11 +415,11 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const buildOptions.emplace(" -DOPWM=64 -DOPWN=64 -DCPWK=8 -DOPTM=4 -DOPTN=4"); } - + mKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_local_buf", "matmul_local_buf", buildOptions); int out_per_thread_m = tileM / localM; int out_per_thread_n = tileN / localN; - + mGlobalWorkSize = {static_cast(M/out_per_thread_m), static_cast(N/out_per_thread_n)}; mLocalWorkSize = {static_cast(localM), static_cast(localN)}; @@ -509,14 +435,14 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf mConvgemmOptLevel==1 Kernel Select"); } else if (mResource->mConv1x1Opt) { - + int tileN = 32; // {"conv_2d_1x1_c4h1w4", "conv_2d_1x1_c4h1w2", "conv_2d_1x1_c4h1w1", "conv_2d_1x1_c8h1w4"}; const int total_kernel = 3; std::string kernelName[total_kernel] = {"conv_2d_1x1_c4h1w4", "conv_2d_1x1_c4h1w2", "conv_2d_1x1_c4h1w1"}; int itemC[total_kernel] = {4, 4, 4}; int itemW[total_kernel] = {4, 2, 1}; - + int actual_kernel = total_kernel; if(mResource->mConv1x1C8Opt) { actual_kernel = 2; @@ -528,7 +454,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const itemC[1] = 8; itemW[1] = 2; } - + std::shared_ptr kernel[total_kernel]; std::vector globalWorkSize[total_kernel]; std::vector localWorkSize[total_kernel]; @@ -543,11 +469,11 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const } kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[knl_idx], buildOption); uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx])); - + uint32_t idx = 0; cl_int ret = CL_SUCCESS; globalWorkSize[knl_idx] = {static_cast(UP_DIV(outputShape.at(3), itemC[knl_idx]) * UP_DIV(outputShape.at(2), itemW[knl_idx])), static_cast(outputShape.at(0) * outputShape.at(1))}; - + ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][0]); ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][1]); ret |= kernel[knl_idx]->get().setArg(idx++, UP_DIV(width, itemW[knl_idx])); @@ -575,7 +501,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const std::shared_ptr quanCommon; int min_index = min_cost.second; mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]}; - + std::set buildOption = mResource->mBuildOptions; if(outputShape.at(3) % itemC[min_index] != 0){ buildOption.emplace("-DCHANNEL_LEAVE"); @@ -609,18 +535,18 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const int strideShape[2] = {mResource->mStrides[0],mResource->mStrides[1]}; int paddingShape[2] = {mPaddings[0], mPaddings[1]}; int dilationShape[2] = {mResource->mDilations[0], mResource->mDilations[1]}; - + // {"conv_2d_c4h1w2", "conv_2d_c4h1w1", "conv_2d_c8h1w1", "conv_2d_c4h1w4", "conv_2d_c8h2w1", "conv_2d_c4h4w1"}; const int total_kernel = 7; std::string kernelName[total_kernel] = {"conv_2d_c4h1w1", "conv_2d_c4h1w2", "conv_2d_c4h4w1", "conv_2d_c8h2w1", "conv_2d_c8h4w1", "conv_2d_c4h1w4", "conv_2d_c8h1w4"}; int itemC[total_kernel] = {4, 4, 4, 8, 8, 4, 8}; int itemH[total_kernel] = {1, 1, 4, 2, 4, 1, 1}; int itemW[total_kernel] = {1, 2, 1, 1, 1, 4, 4}; - - + + int actual_kernel = total_kernel; - - + + std::shared_ptr kernel[total_kernel]; std::vector globalWorkSize[total_kernel]; std::vector localWorkSize[total_kernel]; @@ -635,7 +561,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const } kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[knl_idx], buildOption); uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx])); - + globalWorkSize[knl_idx] = {static_cast(UP_DIV(outputShape.at(3), itemC[knl_idx]) * UP_DIV(outputShape.at(2), itemW[knl_idx])), static_cast(outputShape.at(0) * UP_DIV(outputShape.at(1), itemH[knl_idx]))}; uint32_t idx = 0; cl_int ret = CL_SUCCESS; @@ -678,7 +604,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const buildOption.emplace("-DBLOCK_LEAVE"); } mKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[min_index], buildOption); - + uint32_t idx = 0; cl_int ret = CL_SUCCESS; @@ -736,31 +662,36 @@ ErrorCode ConvBufExecution::onExecute(const std::vector &inputs, const runKernel2D(mPreKernel, mPreGlobalWorkSize, mPreLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event0); mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvBuf2D-gemm2-0", event0}); } - cl::Event event; - runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event); - std::string name = "ConvBuf2D"; - std::string b = std::to_string(inputs[0]->batch()); - std::string ci = std::to_string(inputs[0]->channel()); - std::string hi = std::to_string(inputs[0]->height()); - std::string wi = std::to_string(inputs[0]->width()); - std::string co = std::to_string(outputs[0]->channel()); - std::string ho = std::to_string(outputs[0]->height()); - std::string wo = std::to_string(outputs[0]->width()); - std::string kh = std::to_string(mResource->mKernelHeight); - std::string kw = std::to_string(mResource->mKernelWidth); - std::string total = std::to_string(1.0 / 1000000 * inputs[0]->batch() * inputs[0]->channel() * outputs[0]->channel() * outputs[0]->height() * outputs[0]->width() * mResource->mKernelHeight * mResource->mKernelWidth); - if (mResource->mConvGemmOptLevel > 0) { - std::string m = std::to_string(outputs[0]->width() * outputs[0]->height() * inputs[0]->batch()); - name += "-gemm"; - name += std::to_string(mResource->mConvGemmOptLevel) + "-m" + m + "n" + co + "k" + ci; - } else if (mResource->mConv1x1Opt) { - name += "-conv1x1"; - name += "-b" + b + "ci" + ci + "hi" + hi + "wi" + wi + "co" + co; + + if(mResource->mConvGemmOptLevel == 2) { + mStrassenComputor->onExecute(); } else { - name += "-ori-b" + b + "ci" + ci + "hi" + hi + "wi" + wi + "co" + co+ "ho" + ho + "wo" + wo + "kh" + kh + "kw" + kw; + cl::Event event; + runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event); + std::string name = "ConvBuf2D"; + std::string b = std::to_string(inputs[0]->batch()); + std::string ci = std::to_string(inputs[0]->channel()); + std::string hi = std::to_string(inputs[0]->height()); + std::string wi = std::to_string(inputs[0]->width()); + std::string co = std::to_string(outputs[0]->channel()); + std::string ho = std::to_string(outputs[0]->height()); + std::string wo = std::to_string(outputs[0]->width()); + std::string kh = std::to_string(mResource->mKernelHeight); + std::string kw = std::to_string(mResource->mKernelWidth); + std::string total = std::to_string(1.0 / 1000000 * inputs[0]->batch() * inputs[0]->channel() * outputs[0]->channel() * outputs[0]->height() * outputs[0]->width() * mResource->mKernelHeight * mResource->mKernelWidth); + if (mResource->mConvGemmOptLevel > 0) { + std::string m = std::to_string(outputs[0]->width() * outputs[0]->height() * inputs[0]->batch()); + name += "-gemm"; + name += std::to_string(mResource->mConvGemmOptLevel) + "-m" + m + "n" + co + "k" + ci; + } else if (mResource->mConv1x1Opt) { + name += "-conv1x1"; + name += "-b" + b + "ci" + ci + "hi" + hi + "wi" + wi + "co" + co; + } else { + name += "-ori-b" + b + "ci" + ci + "hi" + hi + "wi" + wi + "co" + co+ "ho" + ho + "wo" + wo + "kh" + kh + "kw" + kw; + } + name += "-total:" + total + "*10^6"; + mOpenCLBackend->getOpenCLRuntime()->pushEvent({name.c_str(), event}); } - name += "-total:" + total + "*10^6"; - mOpenCLBackend->getOpenCLRuntime()->pushEvent({name.c_str(), event}); if (mPostKernel) { cl::Event event2; runKernel2D(mPostKernel, mPostGlobalWorkSize, mPostLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event2); @@ -777,12 +708,17 @@ ErrorCode ConvBufExecution::onExecute(const std::vector &inputs, const if (mPreKernel) { runKernel2D(mPreKernel, mPreGlobalWorkSize, mPreLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); } - runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + + if(mResource->mConvGemmOptLevel == 2) { + mStrassenComputor->onExecute(); + } else { + runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + } if (mPostKernel) { runKernel2D(mPostKernel, mPostGlobalWorkSize, mPostLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); } #endif - + #ifdef LOG_VERBOSE MNN_PRINT("end ConvExecution onExecute !\n"); #endif @@ -819,7 +755,7 @@ class ConvolutionBufCreator : public OpenCLBackend::Creator { return nullptr; } } - } + } #endif if (nullptr != op->main_as_Convolution2D()->quanParameter()) { auto quan = op->main_as_Convolution2D()->quanParameter(); @@ -830,12 +766,12 @@ class ConvolutionBufCreator : public OpenCLBackend::Creator { } } } - + if(op->main_as_Convolution2D()->common()->group() > 1){ // Don't support group > 1 now return nullptr; } - + if (inputs.size() > 1) { // Multi inputs for (int i = 0; i < inputs.size(); ++i) { diff --git a/source/backend/opencl/execution/buffer/ConvBufExecution.hpp b/source/backend/opencl/execution/buffer/ConvBufExecution.hpp index e5abe2a53..96e1ec5aa 100644 --- a/source/backend/opencl/execution/buffer/ConvBufExecution.hpp +++ b/source/backend/opencl/execution/buffer/ConvBufExecution.hpp @@ -12,6 +12,8 @@ #define ConvBufExecution_hpp #include "backend/opencl/execution/image/CommonExecution.hpp" +#include "backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.hpp" + namespace MNN { namespace OpenCL { @@ -82,6 +84,10 @@ class ConvBufExecution : public ConvBufCommonExecution, public CommonExecution { std::vector mPostGlobalWorkSize{1, 1, 1}; std::vector mPostLocalWorkSize{1, 1, 1, 1}; const float* mFilterDataPtr = nullptr; +private: + + std::shared_ptr mStrassenComputor; + }; } // namespace OpenCL diff --git a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp index 36a4afd24..c932c0a6c 100644 --- a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp +++ b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp @@ -9,10 +9,9 @@ // #define LOG_VERBOSE namespace MNN { namespace OpenCL { - // set mDequantScale mDequantOffset mNumQuantBit mFilterDataPtr from mConv2dParams void ConvBufLowMemoryExecution::getInfoFromOpLowMemory(std::shared_ptr & quanCommon) { - quanCommon = ConvolutionCommon::load(mResource->mConv2dParams, this->backend(), false, true); + quanCommon = ConvolutionCommon::load(mOp, this->backend(), false, true); if (mResource->mConv2dParams->quanParameter() != nullptr) { mLowMemoryFlag = true; } else { @@ -23,6 +22,7 @@ void ConvBufLowMemoryExecution::getInfoFromOpLowMemory(std::shared_ptrmNumQuantBit if(quanCommon->canUseInt4){ mResource->mNumQuantBit = 4; + mResource->mInputChannel = (quanCommon->weight.size() * 2) / (mResource->mKernelWidth * mResource->mKernelHeight * mResource->mOutputChannel); }else{ mResource->mNumQuantBit = 8; } @@ -93,9 +93,9 @@ bool ConvBufLowMemoryExecution::convertToQuantWeight1x1Buffer(cl::Buffer input, buildOptions.emplace("-DUSE_LOW_BIT_WEIGHT_INT4"); } else {/* More types to be supported. */} if(mResource->mInputChannel % pack != 0){ - buildOptions.emplace("-DINPUT_CHANNEL_LEAVE"); + buildOptions.emplace("-DCHANNEL_LEAVE"); } - + mBufferToConv1x1Kernel = runtime->buildKernelWithCache("buffer_convert_quant", kernelName, buildOptions); auto kernel = mBufferToConv1x1Kernel->get(); uint32_t gws[2] = {static_cast(UP_DIV(mResource->mInputChannel, pack)), static_cast(mResource->mOutputChannel)}; @@ -128,7 +128,7 @@ bool ConvBufLowMemoryExecution::convertToQuantWeight1x1Buffer(cl::Buffer input, res = runtime->commandQueue().enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(roundUpGroupWorkSize[0], roundUpGroupWorkSize[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); - + event.wait(); MNN_CHECK_CL_SUCCESS(res, "convertToQuantWeight1x1Buffer"); @@ -141,9 +141,15 @@ bool ConvBufLowMemoryExecution::convertToQuantWeight1x1Buffer(cl::Buffer input, // set mKernelBuffer for the 1x1 kernels void ConvBufLowMemoryExecution::set1x1WeightLowMemory(int packCout, int packCin, void * filterDataPtr, std::shared_ptr & quanCommon) { cl_int res; - std::shared_ptr filterBuffer(Tensor::createDevice({ROUND_UP(mResource->mOutputChannel, packCout), ROUND_UP(mResource->mInputChannel, packCin), mResource->mKernelWidth, mResource->mKernelHeight})); + std::shared_ptr filterBuffer(Tensor::createDevice({ROUND_UP(mResource->mOutputChannel, packCout), ROUND_UP(mResource->mInputChannel, packCin), 1, 1})); size_t buffer_size = filterBuffer->usize() / sizeof(float); - size_t cpy_size = mResource->mOutputChannel * mResource->mInputChannel * mResource->mKernelWidth * mResource->mKernelHeight * sizeof(char); + size_t cpy_size = mResource->mOutputChannel * mResource->mInputChannel; + // shared part for all cases + if (mResource->mNumQuantBit == 4){ + // int4 case + buffer_size /= 2; + cpy_size = UP_DIV(cpy_size, 2); + } else {/* More types to be supported. */} float *dequantAlpha = quanCommon->alpha.get(); cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size); void *mapPtr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(filterBufferCL, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res); @@ -154,26 +160,19 @@ void ConvBufLowMemoryExecution::set1x1WeightLowMemory(int packCout, int packCin, MNN_ASSERT(false); } mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(filterBufferCL, mapPtr); - // shared part for all cases - if (mResource->mNumQuantBit == 8) { - // int8 case - buffer_size *= sizeof(int8_t); - } else if (mResource->mNumQuantBit == 4){ - // int4 case - buffer_size /= 2; - } else {/* More types to be supported. */} - + // Use Image load weights if(UP_DIV(mResource->mInputChannel, packCin) <= 16384 && ROUND_UP(mResource->mOutputChannel, packCout) <= 16384){ mResource->mUseImage = true; } if(mResource->mUseImage) { - if(mResource->mNumQuantBit == 4){ - packCin *= 2; - } size_t w = ROUND_UP(mResource->mOutputChannel, packCout); size_t h = UP_DIV(mResource->mInputChannel, packCin); - mResource->mKernelImage.reset(new cl::Image2D(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE, cl::ImageFormat(CL_RGBA, CL_SIGNED_INT32), w, h, 0, nullptr, &res)); + if(mResource->mNumQuantBit == 4){ + mResource->mKernelImage.reset(new cl::Image2D(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT16), w, h, 0, nullptr, &res)); + }else{ + mResource->mKernelImage.reset(new cl::Image2D(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE, cl::ImageFormat(CL_RGBA, CL_SIGNED_INT32), w, h, 0, nullptr, &res)); + } if (nullptr == mResource->mKernelImage.get() || res != CL_SUCCESS) { MNN_ERROR("Alloc Image %d x %d error, code:%d \n", (int)w, (int)h, (int)res); } @@ -185,11 +184,13 @@ void ConvBufLowMemoryExecution::set1x1WeightLowMemory(int packCout, int packCin, // set mFilter for the general kernels void ConvBufLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, std::shared_ptr & quanCommon) { if (filterDataPtr != nullptr) { - std::vector filterImageShape{ROUND_UP(mResource->mInputChannel, 4), (UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight)}; - std::shared_ptr filterBuffer(Tensor::createDevice({mResource->mOutputChannel, ROUND_UP(mResource->mInputChannel, 4), mResource->mKernelWidth, mResource->mKernelHeight})); - // int buffer_size = filterBuffer->elementSize(); + std::shared_ptr filterBuffer(Tensor::createDevice({ROUND_UP(mResource->mOutputChannel, 4), mResource->mInputChannel, mResource->mKernelWidth, mResource->mKernelHeight})); size_t buffer_size = filterBuffer->usize() / sizeof(float); - buffer_size *= sizeof(int8_t); + size_t cpy_size = mResource->mOutputChannel * mResource->mInputChannel * mResource->mKernelWidth * mResource->mKernelHeight; + if (mResource->mNumQuantBit == 4){ + buffer_size /= 2; + cpy_size = UP_DIV(cpy_size, 2); + } cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size); filterBuffer->buffer().device = (uint64_t)(&filterBufferCL); float *dequantAlpha = quanCommon->alpha.get(); @@ -197,14 +198,7 @@ void ConvBufLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, s cl_int res; auto ptrCL = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(filterBufferCL, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res); if(ptrCL != nullptr && res == CL_SUCCESS) { - ::memset(ptrCL, 0, buffer_size); - const int copy_size = mResource->mKernelWidth * mResource->mKernelHeight * sizeof(int8_t); - for(int oc=0; ocmOutputChannel; oc++) { - int ic = 0; - for(; icmInputChannel; ic++) { - ::memcpy((int8_t *)ptrCL + (oc * ROUND_UP(mResource->mInputChannel, 4) + ic) * mResource->mKernelWidth * mResource->mKernelHeight, ((int8_t *)filterDataPtr) + (oc * mResource->mInputChannel + ic) * mResource->mKernelWidth * mResource->mKernelHeight, copy_size); - } - } + ::memcpy(ptrCL, filterDataPtr, cpy_size); } else { MNN_ERROR("setGeneralWeightLowMemory: Map error ptrCL == nullptr \n"); } @@ -212,7 +206,7 @@ void ConvBufLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, s // convert to NC4HW4 if (mResource->mNumQuantBit == 8) { // ROUND_UP(IC, 4), UP_DIV(OC, 4) * mKernelWidth * mKernelHeight - mResource->mFilter.reset(Tensor::createDevice({1, filterImageShape[1], 1, 4 * filterImageShape[0]})); + mResource->mFilter.reset(Tensor::createDevice({1, UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight, 1, 4 * ROUND_UP(mResource->mInputChannel, 4)})); mOpenCLBackend->onAcquireBuffer(mResource->mFilter.get(), Backend::STATIC); MNN::OpenCL::BufferConvertor bufferConvertor{mOpenCLBackend->getOpenCLRuntime()}; // filterBuffer shape: {OC, ROUND_UP(IC, 4), mKernelWidth, mKernelHeight} @@ -222,7 +216,7 @@ void ConvBufLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, s // For int4 case, data stored in mFilter should be uint8_t, // while "Tensor::createDevice" occupies more memory than "Tensor::createDevice". // Therefore, we use "Tensor::createDevice" currently, leaving "Tensor::createDevice" to be supported. - mResource->mFilter.reset(Tensor::createDevice({1, filterImageShape[1], 1, 2 * filterImageShape[0]})); + mResource->mFilter.reset(Tensor::createDevice({1, UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight, 1, 2 * ROUND_UP(mResource->mInputChannel, 4)})); mOpenCLBackend->onAcquireBuffer(mResource->mFilter.get(), Backend::STATIC); MNN::OpenCL::BufferConvertor bufferConvertor{mOpenCLBackend->getOpenCLRuntime()}; // filterBuffer shape: {OC, ROUND_UP(IC, 4), mKernelWidth, mKernelHeight} @@ -352,8 +346,9 @@ void ConvBufLowMemoryExecution::tuneGeneralCaseLowMemory(Tensor * input, Tensor unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; return; } -void ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * output) { +unsigned int ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * output) { auto &unit = mUnits[0]; + unsigned int total_time = 0; std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); const int outChannel = outputShape.at(3); @@ -379,7 +374,7 @@ void ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * outpu if(width == 1 && height == 1){ buildOption.emplace("-DWIDTH_HEIGHT_1"); } - + if(blockDim % 16 != 0){ buildOption.emplace("-DINPUT_CHANNEL_LEAVE"); } else if (mResource->mUseImage && mResource->mNumQuantBit == 4 && blockDim % 32 != 0) { @@ -401,7 +396,7 @@ void ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * outpu for (; knl_idx < actual_kernel; knl_idx++) { kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemv_conv1x1_buf", kernelName[knl_idx], buildOption); uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx])); - + globalWorkSize[knl_idx] = {static_cast(UP_DIV(outChannel, itemC[knl_idx]) * width), static_cast(global_y)}; uint32_t idx = 0; cl_int ret = CL_SUCCESS; @@ -433,10 +428,11 @@ void ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * outpu mLocalWorkSize = {retTune.first[0], retTune.first[1]}; } } + total_time += min_cost.first; int min_index = min_cost.second; mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]}; - - + + unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemv_conv1x1_buf", kernelName[min_index], buildOption); //MNN_PRINT("Kernel is %d.\n", min_index); uint32_t idx = 0; @@ -464,10 +460,11 @@ void ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * outpu mOpenCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; - return; + return total_time; } -void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor * output) { +unsigned int ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor * output) { mUnits.resize(3); + unsigned int total_time = 0; std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); const int outChannel = outputShape.at(3); @@ -478,16 +475,16 @@ void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor * const int outputChannelBlocks = UP_DIV(outChannel, 4); const int blockNum = mResource->mBlockSize; const int blockDim = mResource->mInputChannel / mResource->mBlockSize; - + int global_y = UP_DIV(batch, 4) * width_height; - const int total_kernel = 5; - std::string kernelName[total_kernel] = {"gemm_b4_c1_buf", "gemm_b4_c2_buf", "gemm_b4_c4_buf", "gemm_b4_c1_image", "gemm_b4_c2_image"}; - int itemC[total_kernel] = {1, 2, 4, 1, 2}; + const int total_kernel = 6; + std::string kernelName[total_kernel] = {"gemm_b4_c1_buf", "gemm_b4_c2_buf", "gemm_b4_c4_buf", "gemm_b4_c1_image", "gemm_b4_c2_image", "gemm_b4_c4_image"}; + int itemC[total_kernel] = {1, 2, 4, 1, 2, 4}; int actual_kernel = total_kernel; std::shared_ptr kernel[total_kernel]; std::vector globalWorkSize[total_kernel]; std::vector localWorkSize[total_kernel]; - std::pair min_cost(INT_MAX, 0);//(min_time, min_index) + std::pair min_cost(INT_MAX, 0);//(min_time, min_index) std::set buildOption = mResource->mBuildOptions; if(blockDim % 16 != 0){ buildOption.emplace("-DINPUT_CHANNEL_LEAVE"); @@ -510,7 +507,7 @@ void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor * mGlobalWorkSize = {static_cast(UP_DIV(mResource->mInputChannel, 4)), static_cast(UP_DIV(batch, 4)), static_cast(width_height)}; unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_quant_batch_buf", "reshape_nchw4_nhwc4", buildOption); uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel)); - + uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); @@ -523,7 +520,9 @@ void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor * ret |= unit.kernel->get().setArg(idx++, static_cast(inputChannels)); ret |= unit.kernel->get().setArg(idx++, static_cast(inputChannelBlocks)); MNN_CHECK_CL_SUCCESS(ret, "setArg reshape_nc4_cn4"); - mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "reshape_nchw4_nhwc4", unit.kernel).first; + std::pair, unsigned int> retTune = localWS3DDefault(mGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "reshape_nchw4_nhwc4", unit.kernel); + total_time += retTune.second; + mLocalWorkSize = retTune.first; mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; @@ -540,7 +539,7 @@ void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor * for (; knl_idx < actual_kernel; knl_idx++) { kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_quant_batch_buf", kernelName[knl_idx], buildOption); uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx])); - + globalWorkSize[knl_idx] = {static_cast(UP_DIV(outChannel, itemC[knl_idx])), static_cast(global_y)}; uint32_t idx = 0; cl_int ret = CL_SUCCESS; @@ -560,18 +559,19 @@ void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor * ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(blockNum)); ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(blockDim)); MNN_CHECK_CL_SUCCESS(ret, "setArg gemv_conv1x1_buf Kernel Select"); - std::pair, int> retTune; + std::pair, unsigned int> retTune; retTune = localWS2DDefault(globalWorkSize[knl_idx], maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName[knl_idx] + info, kernel[knl_idx]); - if(min_cost.first > retTune.second) { + if(min_cost.first > retTune.second) { min_cost.first = retTune.second; min_cost.second = knl_idx; mLocalWorkSize = {retTune.first[0], retTune.first[1]}; } } + total_time += min_cost.first; int min_index = min_cost.second; mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]}; - - + + unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_quant_batch_buf", kernelName[min_index], buildOption); //MNN_PRINT("Kernel is %d.\n", min_index); uint32_t idx = 0; @@ -613,12 +613,14 @@ void ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor * ret |= unit.kernel->get().setArg(idx++, static_cast(batch)); ret |= unit.kernel->get().setArg(idx++, static_cast(outputChannelBlocks)); MNN_CHECK_CL_SUCCESS(ret, "setArg reshape_cn4_nc4"); - mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "reshape_nhwc4_nchw4", unit.kernel).first; + std::pair, unsigned int> retTune = localWS3DDefault(mGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "reshape_nhwc4_nchw4", unit.kernel); + mLocalWorkSize = retTune.first; + total_time += retTune.second; mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; } - return; + return total_time; } ConvBufLowMemoryExecution::ConvBufLowMemoryExecution(const std::vector &inputs, const std::vector &outputs, const MNN::Op *op, Backend *backend) : ConvBufCommonExecution(op->main_as_Convolution2D(), backend), CommonExecution(backend, op) { @@ -706,9 +708,17 @@ ErrorCode ConvBufLowMemoryExecution::onEncode(const std::vector &input // onclone default use conv1x1Opt, need reset std::vector outputShape = tensorShapeFormat(output); const int batch = outputShape.at(0); - bool isMali = mOpenCLBackend->getOpenCLRuntime()->getGpuType() == MALI; + auto runTime = mOpenCLBackend->getOpenCLRuntime(); if (mResource->mConv1x1Opt) { - if(batch > 1 && isMali){ + if(batch > 1 && false == getPreParamInfo("ConvBufLowMemoryPreArrangeMode", &batchConvMode, runTime)){ + if(tuneGemvBatchLowMemory(input, output) < tuneGemmLowMemory(input, output)){ + batchConvMode = 1; + } else{ + batchConvMode = 2; + } + setPreParamInfo("ConvBufLowMemoryPreArrangeMode", batchConvMode, runTime); + } + if(batch > 1 && batchConvMode == 1){ tuneGemvBatchLowMemory(input, output); }else{ tuneGemmLowMemory(input, output); diff --git a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.hpp b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.hpp index de0938c7b..8488f461b 100644 --- a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.hpp +++ b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.hpp @@ -28,8 +28,8 @@ class ConvBufLowMemoryExecution : public ConvBufCommonExecution, public CommonEx void set1x1WeightLowMemory(int packCout, int packCin, void * filterDataPtr, std::shared_ptr & quanCommon); void setGeneralWeightLowMemory(void * filterDataPtr, std::shared_ptr & quanCommon); void tuneGeneralCaseLowMemory(Tensor * input, Tensor * output); - void tuneGemmLowMemory(Tensor * input, Tensor * output); - void tuneGemvBatchLowMemory(Tensor * input, Tensor * output); + unsigned int tuneGemmLowMemory(Tensor * input, Tensor * output); + unsigned int tuneGemvBatchLowMemory(Tensor * input, Tensor * output); bool convertToQuantWeight1x1Buffer(cl::Buffer input, int pack); std::vector mPaddings{0, 0}; std::vector mGlobalWorkSize{1, 1, 1}; @@ -39,6 +39,7 @@ class ConvBufLowMemoryExecution : public ConvBufCommonExecution, public CommonEx std::shared_ptr mConvGemmInpTensor; std::shared_ptr mConvGemmOutTensor; std::shared_ptr mBufferToConv1x1Kernel = nullptr; + uint32_t batchConvMode = 0; // batch > 1 convolution input arrage mode. 0 is need tune; 1 arrage to n/4chw4; 2 arrage to c/4hwn4 }; } // namespace OpenCL diff --git a/source/backend/opencl/execution/buffer/ConvBufWinograd.cpp b/source/backend/opencl/execution/buffer/ConvBufWinograd.cpp index 17ca12a3e..c7b7fc644 100644 --- a/source/backend/opencl/execution/buffer/ConvBufWinograd.cpp +++ b/source/backend/opencl/execution/buffer/ConvBufWinograd.cpp @@ -85,7 +85,7 @@ ConvBufWinograd::ConvBufWinograd(const MNN::Op* op, Backend* backend) : CommonEx int weightSize = 0; const float* filterDataPtr = nullptr; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2D, &filterDataPtr, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &weightSize); mCo = mResource->mCommon->outputCount(); mCi = weightSize / mCo / mResource->mCommon->kernelX() / mResource->mCommon->kernelY(); diff --git a/source/backend/opencl/execution/buffer/ConvSubgroupBufExecution.cpp b/source/backend/opencl/execution/buffer/ConvSubgroupBufExecution.cpp index 142c49268..70685e7bb 100644 --- a/source/backend/opencl/execution/buffer/ConvSubgroupBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/ConvSubgroupBufExecution.cpp @@ -110,7 +110,7 @@ ConvSubgroupBuf::ConvSubgroupBuf(const std::vector &inputs, const std: const float *FilterDataPtr = NULL; int weightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2dParams, &FilterDataPtr, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &FilterDataPtr, &weightSize); if (FilterDataPtr != nullptr) { std::shared_ptr sourceWeight( Tensor::create(std::vector{mResource->mOutputChannel, mResource->mInputChannel, mResource->mKernelWidth, mResource->mKernelHeight}, @@ -149,7 +149,7 @@ ConvSubgroupBuf::ConvSubgroupBuf(const std::vector &inputs, const std: queue.enqueueUnmapMemObject(weightBuffer, weight_ptr); } - } + } { int biasSize = conv2dParams->common()->outputCount(); int buffer_size = ROUND_UP(biasSize, 16); // pack to 16 @@ -265,7 +265,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s const int inputHeight = inputShape.at(1); const int inputWidth = inputShape.at(2); const int inputChannels = inputShape.at(3); - + int input_width_pad = mResource->mStrides[1] * (8 - 1) + (mResource->mKernelWidth - 1) * mResource->mDilations[1] + 1 + width * mResource->mStrides[1] + mPaddings[1]; int input_height_pad = inputHeight + 2 * mPaddings[0]; uint32_t MaxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->MaxWorkGroupSize()); @@ -285,7 +285,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s uint32_t sub_group_size = 16; uint32_t slm_div_factor = tune_param.second; uint32_t work_group_size = sub_group_size * slm_div_factor; - uint32_t feature_block_size = 16; + uint32_t feature_block_size = 16; uint32_t input_line_size = strideShape[1] * (blockWidth - 1) + (kernelShape[1] - 1) * dilationShape[1] + 1; uint32_t input_block_size = UP_DIV(input_line_size * kernelShape[0] * dilationShape[0], sub_group_size); uint32_t x_blocks = UP_DIV(outputImageShape[1], blockWidth); @@ -303,9 +303,9 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s mOpenCLBackend->onAcquireBuffer(mSource.get(), Backend::DYNAMIC); mOpenCLBackend->onReleaseBuffer(mSource.get(), Backend::DYNAMIC); unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("input_transe_buf", "conv_transe_c4_c1", {}); - + uint32_t mMaxWGS_S = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel)); - + mTranseGlobalWorkSize = {static_cast(inputWidth * inputHeight), static_cast(UP_DIV(inputShape.at(3), 4)), static_cast(inputShape.at(0))}; @@ -321,7 +321,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s unit.kernel->get().setArg(idx++, UP_DIV(inputShape.at(3), 4)); unit.kernel->get().setArg(idx++, static_cast(inputpad.left)); unit.kernel->get().setArg(idx++, static_cast(inputpad.right)); - + mTranseLocalWorkSize = localWS3DDefault(mTranseGlobalWorkSize, mMaxWGS_S, mOpenCLBackend->getOpenCLRuntime(), "conv_transe_c4_c1", unit.kernel).first; mOpenCLBackend->recordKernel3d(unit.kernel, mTranseGlobalWorkSize, mTranseLocalWorkSize); } else { @@ -329,9 +329,9 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s mOpenCLBackend->onAcquireBuffer(mSource.get(), Backend::DYNAMIC); mOpenCLBackend->onReleaseBuffer(mSource.get(), Backend::DYNAMIC); unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("input_transe_buf", "conv_transe_c4_c16", {}); - + uint32_t mMaxWGS_S = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel)); - + mTranseGlobalWorkSize = {static_cast(inputWidth * inputHeight), static_cast(UP_DIV(inputShape.at(3), 4)), static_cast(inputShape.at(0))}; @@ -347,7 +347,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s unit.kernel->get().setArg(idx++, UP_DIV(inputShape.at(3), 4)); unit.kernel->get().setArg(idx++, static_cast(inputpad.left)); unit.kernel->get().setArg(idx++, static_cast(inputpad.right)); - + mTranseLocalWorkSize = localWS3DDefault(mTranseGlobalWorkSize, mMaxWGS_S, mOpenCLBackend->getOpenCLRuntime(), "conv_transe_c4_c16", unit.kernel).first; mOpenCLBackend->recordKernel3d(unit.kernel, mTranseGlobalWorkSize, mTranseLocalWorkSize); } @@ -355,7 +355,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s unit.localWorkSize = {mTranseLocalWorkSize[0], mTranseLocalWorkSize[1], mTranseLocalWorkSize[2]}; mUnits.emplace_back(unit); } - + Unit unit; if (inputChannels < 16 && in_c_pack == 4) { std::set buildOptions = mResource->mBuildOptions; @@ -410,7 +410,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s #ifdef LOG_VERBOSE MNN_PRINT("end ConvSubgroupBuf onResize !\n"); #endif - + mOpenCLBackend->recordKernel3d(unit.kernel , mGlobalWorkSize, mLocalWorkSize); unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; diff --git a/source/backend/opencl/execution/buffer/DeconvBufExecution.cpp b/source/backend/opencl/execution/buffer/DeconvBufExecution.cpp index 4f1b990b7..096594ebc 100644 --- a/source/backend/opencl/execution/buffer/DeconvBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/DeconvBufExecution.cpp @@ -35,7 +35,7 @@ DeconvBufExecution::DeconvBufExecution(const std::vector &inputs, cons const float* filterDataPtr = nullptr; int weightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2dParams, &filterDataPtr, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &weightSize); int inputChannel = weightSize / (kernelWidth * kernelHeight * outputChannel); std::vector filterShape{outputChannel, inputChannel, kernelHeight, kernelWidth}; diff --git a/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp b/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp index 4e541fc4c..5bc18f9ff 100644 --- a/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp @@ -34,7 +34,7 @@ DepthwiseConvBufExecution::DepthwiseConvBufExecution(const std::vector const float* filterDataPtr = nullptr; int filterDataSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend, mResource->mConv2dParams, &filterDataPtr, &filterDataSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &filterDataSize); mResource->mFilter.reset(Tensor::createDevice({1, ROUND_UP(filterImageShape[1], 2)/*for kernel C8 read*/, 1, 4 * filterImageShape[0]})); std::shared_ptr filterBuffer(Tensor::createDevice(filterShape)); diff --git a/source/backend/opencl/execution/buffer/DepthwiseConvSubgroupBufExecution.cpp b/source/backend/opencl/execution/buffer/DepthwiseConvSubgroupBufExecution.cpp index 6b62c1286..90bcb5c36 100644 --- a/source/backend/opencl/execution/buffer/DepthwiseConvSubgroupBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/DepthwiseConvSubgroupBufExecution.cpp @@ -40,7 +40,7 @@ DepthwiseConvSubgroupBufExecution::DepthwiseConvSubgroupBufExecution(const std:: const float *filterDataPtr = nullptr; int filterDataSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend, mResource->mConv2dParams, &filterDataPtr, &filterDataSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &filterDataSize); if (filterDataPtr != nullptr) { std::shared_ptr sourceWeight(Tensor::create( std::vector{1, outputChannel, kernelWidth, kernelHeight}, @@ -112,7 +112,7 @@ DepthwiseConvSubgroupBufExecution::DepthwiseConvSubgroupBufExecution(const std:: } mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(biasBuffer, biasPtrCL); } - + if (mResource->mConv2dCommonParams->relu() == true) { mResource->mBuildOptions.emplace("-DRELU"); } else if (mResource->mConv2dCommonParams->relu6() == true) { @@ -178,7 +178,7 @@ ErrorCode DepthwiseConvSubgroupBufExecution::onEncode(const std::vectormConv2dCommonParams); mPaddings[0] = padding.second;//padY mPaddings[1] = padding.first;//padX - + const int outputHeight = outputShape.at(1); const int outputWidth = outputShape.at(2); const int outputChannel = outputShape.at(3); @@ -190,7 +190,7 @@ ErrorCode DepthwiseConvSubgroupBufExecution::onEncode(const std::vectormConv2dParams->common()->kernelY(); const int filterWidth = mResource->mConv2dParams->common()->kernelX(); - + int inputImageShape[2] = {inputHeight, inputWidth}; int outputImageShape[2] = {outputHeight, outputWidth}; int strideShape[2] = {mResource->mStrides[0], mResource->mStrides[1]}; @@ -273,7 +273,7 @@ ErrorCode DepthwiseConvSubgroupBufExecution::onEncode(const std::vectorget().setArg(idx++, static_cast(outputpad.right)); unit.kernel->get().setArg(idx++, static_cast(paddingShape[1])); unit.kernel->get().setArg(idx++, static_cast(paddingShape[0])); - + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; diff --git a/source/backend/opencl/execution/buffer/GroupNormBufExecution.cpp b/source/backend/opencl/execution/buffer/GroupNormBufExecution.cpp index 03589bc6b..92485742b 100644 --- a/source/backend/opencl/execution/buffer/GroupNormBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/GroupNormBufExecution.cpp @@ -81,6 +81,9 @@ GroupNormBufExecution::GroupNormBufExecution(const MNN::Op* op, Backend* backend } else { MNN_ERROR("GroupNorm Beta map error:%d\n", res); } + + mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(gammaBuffer, GammaPtrCL); + mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(betaBuffer, BetaPtrCL); } } diff --git a/source/backend/opencl/execution/buffer/MatmulBufExecution.cpp b/source/backend/opencl/execution/buffer/MatmulBufExecution.cpp index 0d0e645db..ea055eb37 100644 --- a/source/backend/opencl/execution/buffer/MatmulBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/MatmulBufExecution.cpp @@ -58,7 +58,7 @@ ErrorCode MatMulBufExecution::onEncode(const std::vector &inputs, cons bool canUseTile = (M % tileM == 0) && \ (N % tileN == 0) && \ (K % tileK == 0); - bool canUseLargeTile = canUseTile && mTransposeA && !mTransposeB && inputs.size() == 2; + bool canUseLargeTile = canUseTile && mTransposeA && !mTransposeB; if (!canUseLargeTile) { // set small tile tileM = 64; @@ -72,16 +72,41 @@ ErrorCode MatMulBufExecution::onEncode(const std::vector &inputs, cons if(canUseLargeTile) { // Match with Large tileM->MWG tileN->NWG tileK->KWG localM->MDIMA localN->NDIMC - buildOptions.emplace(" -DGEMMK=0 -DKREG=1 -DKWG=32 -DKWI=2 -DMDIMA=32 -DMDIMC=32 -DMWG=128 -DNDIMB=8 -DNDIMC=8 -DNWG=128 -DSA=0 -DSB=0 -DSTRM=0 -DSTRN=1 -DVWM=2 -DVWN=8 -DOUTPUTMN"); + uint32_t layout = 4; + uint32_t batch = 1; + std::vector param; + if(inputs.size() == 2) { + param = getGemmParams({(uint32_t)M, (uint32_t)N, (uint32_t)K, layout, batch, (uint32_t)0}, {openCLBuffer(input0), openCLBuffer(input1), openCLBuffer(output)}, mOpenCLBackend->getOpenCLRuntime()); + } else { + param = getGemmParams({(uint32_t)M, (uint32_t)N, (uint32_t)K, layout, batch, (uint32_t)1}, {openCLBuffer(input0), openCLBuffer(input1), openCLBuffer(output), openCLBuffer(inputs[2])}, mOpenCLBackend->getOpenCLRuntime()); + } + int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13]; + buildOptions.emplace("-DKWG=" + std::to_string(KWG)); + buildOptions.emplace("-DKWI=" + std::to_string(KWI)); + buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA)); + buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC)); + buildOptions.emplace("-DMWG=" + std::to_string(MWG)); + buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB)); + buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC)); + buildOptions.emplace("-DNWG=" + std::to_string(NWG)); + buildOptions.emplace("-DSA=" + std::to_string(SA)); + buildOptions.emplace("-DSB=" + std::to_string(SB)); + buildOptions.emplace("-DSTRM=" + std::to_string(STRM)); + buildOptions.emplace("-DSTRN=" + std::to_string(STRN)); + buildOptions.emplace("-DVWM=" + std::to_string(VWM)); + buildOptions.emplace("-DVWN=" + std::to_string(VWN)); + if(layout >= 4) { + buildOptions.emplace("-DOUTPUTMN"); + } + + if(inputs.size() > 2) { + buildOptions.emplace(" -DBIAS_TYPE=1"); + } if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) { buildOptions.emplace("-DUSE_CL_MAD=1"); buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1"); } - if(runtime->isSupportedFP16()){ - buildOptions.emplace(" -DPRECISION=16"); - } else { - buildOptions.emplace(" -DPRECISION=32"); - } + unit.kernel = runtime->buildKernel("matmul_params_buf", "Xgemm", buildOptions); } else if(canUseTile) { @@ -117,7 +142,9 @@ ErrorCode MatMulBufExecution::onEncode(const std::vector &inputs, cons float alpha = 1.0; float beta = 0.0f; - int offset = 0; + int offset[4] = {0, 0, 0, 0}; + int stride[4] = {M, N, N, N}; + int idx = 0; ret |= unit.kernel->get().setArg(idx++, static_cast(M)); ret |= unit.kernel->get().setArg(idx++, static_cast(N)); @@ -131,8 +158,7 @@ ErrorCode MatMulBufExecution::onEncode(const std::vector &inputs, cons } ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); ret |= unit.kernel->get().setArg(idx++, offset); - ret |= unit.kernel->get().setArg(idx++, offset); - ret |= unit.kernel->get().setArg(idx++, offset); + ret |= unit.kernel->get().setArg(idx++, stride); MNN_CHECK_CL_SUCCESS(ret, "setArg MatMulBufExecution use large tile opt"); diff --git a/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.cpp b/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.cpp new file mode 100644 index 000000000..ff1bddda1 --- /dev/null +++ b/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.cpp @@ -0,0 +1,470 @@ +// +// StrassenMatmulComputor.cpp +// MNN +// +// Created by MNN on 2024/08/01. +// Copyright © 2018, Alibaba Group Holding Limited +// +#ifndef MNN_OPENCL_BUFFER_CLOSED +#include "backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.hpp" +#include "core/TensorUtils.hpp" +//#define MNN_OPEN_TIME_TRACE +#include + +namespace MNN { +namespace OpenCL { + +class AutoMemory { +public: + AutoMemory(int size, OpenCLBackend* backend) { + mOpenCLBackend = backend; + mTempTensor.reset(Tensor::createDevice({size})); + bool res = mOpenCLBackend->onAcquireBuffer(mTempTensor.get(), Backend::DYNAMIC); + if (!res) { + MNN_ERROR("Strassen out of memory\n"); + } + mAddrPtr = openCLBuffer(mTempTensor.get()); + } + ~ AutoMemory() { + mOpenCLBackend->onReleaseBuffer(mTempTensor.get(), Backend::DYNAMIC); + } + const cl::Buffer& get() const { + return mAddrPtr; + } +private: + cl::Buffer mAddrPtr; + OpenCLBackend* mOpenCLBackend; + std::shared_ptr mTempTensor; +}; + +StrassenMatrixComputor::StrassenMatrixComputor(Backend* bn, int maxDepth) { + mMaxDepth = maxDepth; + mOpenCLBackend = static_cast(bn); + mBytes = (mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16() ? 2 : 4); + onReset(); +}; +StrassenMatrixComputor::~StrassenMatrixComputor() { + // Do nothing +} + +ErrorCode StrassenMatrixComputor::_generateCFunction(cl::Buffer ptrC, int offsetC, int elementStrideC, cl::Buffer ptrA, int width, int height, Unit& unit) { + std::set buildOptions; + int vec_h = 1; + buildOptions.emplace("-DVEC_H=" + std::to_string(vec_h)); + unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("strassen_binary_buf", "binary_cfunction_buf", buildOptions); + auto maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel)); + + std::vector globalWorkSize = {(uint32_t)UP_DIV(width, 8), (uint32_t)UP_DIV(height, vec_h)}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(index++, globalWorkSize[0]); + ret |= unit.kernel->get().setArg(index++, globalWorkSize[1]); + ret |= unit.kernel->get().setArg(index++, ptrC); + ret |= unit.kernel->get().setArg(index++, offsetC); + ret |= unit.kernel->get().setArg(index++, elementStrideC); + ret |= unit.kernel->get().setArg(index++, ptrA); + ret |= unit.kernel->get().setArg(index++, ptrC); + ret |= unit.kernel->get().setArg(index++, width); + ret |= unit.kernel->get().setArg(index++, height); + + MNN_CHECK_CL_SUCCESS(ret, "Strassen setArg BinaryCFunctionExecution"); + + std::string name = "binary_cfunction_buf"; + auto localWorkSize = localWS2DDefault(globalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), name, unit.kernel).first; + + globalWorkSize[0] = ROUND_UP(globalWorkSize[0], std::max((uint32_t)1, localWorkSize[0])); + globalWorkSize[1] = ROUND_UP(globalWorkSize[1], std::max((uint32_t)1, localWorkSize[1])); + + unit.globalWorkSize = {globalWorkSize[0], globalWorkSize[1]}; + unit.localWorkSize = {localWorkSize[0], localWorkSize[1]}; + mOpenCLBackend->recordKernel2d(unit.kernel, globalWorkSize, localWorkSize); + return NO_ERROR; + +} + +ErrorCode StrassenMatrixComputor::_generateBinary(cl::Buffer ptrC, cl::Buffer ptrA, cl::Buffer ptrB, int offsetC, int offsetA, int offsetB, int elementStrideC, int elementStrideA, int elementStrideB, int width, int height, bool isAdd, Unit& unit) { + std::set buildOptions; + if(isAdd) { + buildOptions.emplace("-DOPERATOR=in0+in1"); + } else { + buildOptions.emplace("-DOPERATOR=in0-in1"); + } + int vec_h = 1; + buildOptions.emplace("-DVEC_H=" + std::to_string(vec_h)); + unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("strassen_binary_buf", "binary_function_buf", buildOptions); + auto maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel)); + + std::vector globalWorkSize = {(uint32_t)UP_DIV(width, 8), (uint32_t)UP_DIV(height, vec_h)}; + int baseOffset[4] = {offsetA, offsetB, offsetC, 0}; + int elementStride[4] = {elementStrideA, elementStrideB, elementStrideC, 0}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(index++, globalWorkSize[0]); + ret |= unit.kernel->get().setArg(index++, globalWorkSize[1]); + ret |= unit.kernel->get().setArg(index++, ptrA); + ret |= unit.kernel->get().setArg(index++, ptrB); + ret |= unit.kernel->get().setArg(index++, ptrC); + ret |= unit.kernel->get().setArg(index++, baseOffset); + ret |= unit.kernel->get().setArg(index++, elementStride); + + MNN_CHECK_CL_SUCCESS(ret, "Strassen setArg BinaryExecution"); + + std::string name = "binary_function_buf"; + auto localWorkSize = localWS2DDefault(globalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), name, unit.kernel).first; + + globalWorkSize[0] = ROUND_UP(globalWorkSize[0], std::max((uint32_t)1, localWorkSize[0])); + globalWorkSize[1] = ROUND_UP(globalWorkSize[1], std::max((uint32_t)1, localWorkSize[1])); + + unit.globalWorkSize = {globalWorkSize[0], globalWorkSize[1]}; + unit.localWorkSize = {localWorkSize[0], localWorkSize[1]}; + mOpenCLBackend->recordKernel2d(unit.kernel, globalWorkSize, localWorkSize); + return NO_ERROR; +} + +ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, const MatrixInfo& AT, const MatrixInfo& BT, const MatrixInfo& CT, const MatrixInfo& COT, int postType, Unit& unit) { + + std::set buildOptions; + + uint32_t layout = 4; + uint32_t batch = 1; + + std::vector param; + if(COT.stackIndex < 0 || postType == 0) { + param = getGemmParams({(uint32_t)e, (uint32_t)h, (uint32_t)l, layout, batch, (uint32_t)0}, {mStack[AT.stackIndex], mStack[BT.stackIndex], mStack[CT.stackIndex]}, mOpenCLBackend->getOpenCLRuntime()); + } else { + param = getGemmParams({(uint32_t)e, (uint32_t)h, (uint32_t)l, layout, batch, (uint32_t)postType}, {mStack[AT.stackIndex], mStack[BT.stackIndex], mStack[CT.stackIndex], mStack[COT.stackIndex]}, mOpenCLBackend->getOpenCLRuntime()); + } + int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13]; + buildOptions.emplace("-DKWG=" + std::to_string(KWG)); + buildOptions.emplace("-DKWI=" + std::to_string(KWI)); + buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA)); + buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC)); + buildOptions.emplace("-DMWG=" + std::to_string(MWG)); + buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB)); + buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC)); + buildOptions.emplace("-DNWG=" + std::to_string(NWG)); + buildOptions.emplace("-DSA=" + std::to_string(SA)); + buildOptions.emplace("-DSB=" + std::to_string(SB)); + buildOptions.emplace("-DSTRM=" + std::to_string(STRM)); + buildOptions.emplace("-DSTRN=" + std::to_string(STRN)); + buildOptions.emplace("-DVWM=" + std::to_string(VWM)); + buildOptions.emplace("-DVWN=" + std::to_string(VWN)); + if(layout >= 4) { + buildOptions.emplace("-DOUTPUTMN"); + } + + if(postType > 0) { + buildOptions.emplace(" -DBIAS_TYPE=" + std::to_string(postType)); + } + + int tileM = MWG; + int tileN = NWG; + int localM = MDIMC; + int localN = NDIMC; + int alignM = e; + int alignN = h; + int alignK = l; + if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) { + buildOptions.emplace("-DUSE_CL_MAD=1"); + buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1"); + } + + unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_params_buf", "Xgemm", buildOptions); + + int out_per_thread_m = tileM / localM; + int out_per_thread_n = tileN / localN; + + std::vector globalWorkSize = {static_cast(alignM/out_per_thread_m), static_cast(alignN/out_per_thread_n)}; + std::vector localWorkSize = {static_cast(localM), static_cast(localN)}; + + float alpha = 1.0; + float beta = 0.0f; + // offset_a, offset_b, offset_c, offset_bias + int offset[4] = {AT.offsetBytes / mBytes, BT.offsetBytes / mBytes, CT.offsetBytes / mBytes, COT.offsetBytes / mBytes}; + // stride_a, stride_b, stride_c, stride_bias + int stride[4] = {AT.lineStrideBytes / mBytes, BT.lineStrideBytes / mBytes, CT.lineStrideBytes / mBytes, COT.lineStrideBytes / mBytes}; + + int idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, static_cast(alignM)); + ret |= unit.kernel->get().setArg(idx++, static_cast(alignN)); + ret |= unit.kernel->get().setArg(idx++, static_cast(alignK)); + ret |= unit.kernel->get().setArg(idx++, alpha); + ret |= unit.kernel->get().setArg(idx++, beta); + ret |= unit.kernel->get().setArg(idx++, mStack[AT.stackIndex]); + ret |= unit.kernel->get().setArg(idx++, mStack[BT.stackIndex]); + if(postType > 0) { + ret |= unit.kernel->get().setArg(idx++, mStack[COT.stackIndex]); + } + ret |= unit.kernel->get().setArg(idx++, mStack[CT.stackIndex]); + ret |= unit.kernel->get().setArg(idx++, offset); + ret |= unit.kernel->get().setArg(idx++, stride); + + MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf Strassen Kernel Select"); + + unit.globalWorkSize = {globalWorkSize[0], globalWorkSize[1]}; + unit.localWorkSize = {localWorkSize[0], localWorkSize[1]}; + mOpenCLBackend->recordKernel2d(unit.kernel, globalWorkSize, localWorkSize); + + return NO_ERROR; +} + +ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const MatrixInfo& AT, const MatrixInfo& BT, const MatrixInfo& CT, const MatrixInfo& COT, int currentDepth, int postType) { + + bool isAligned = (e % 32 == 0 && l % 4 == 0 && h % 32 == 0); + bool enoughComputation = (e >= 512 && l >= 512 && h >= 512) && (1.0 * e / 1024 * l / 1024 * h / 1024 >= 4.0); + + if (currentDepth >= mMaxDepth || !isAligned || !enoughComputation) {// not align or not enough computation + Unit unit; + auto res = _generateBasicMatMul(e, l, h, AT, BT, CT, COT, postType, unit); + mUnits.emplace_back(unit); + return res; + } + int eSub = e / 2; + int hSub = h / 2; + int lSub = l / 2; + + // Compute expand the memory read and write cost + float AComputeCost = 1.0 * eSub * lSub * 12 * mBytes;// 4 times, 3 matrix each time + float BComputeCost = 1.0 * lSub * hSub * 12 * mBytes;// 4 times, 3 matrix each time + float CComputeCost = 1.0 * eSub * hSub * (8 + 3 * 2) * mBytes;// 3 times, 8 matrix first time, 3 matrix last two times + // Compute save compute time + float saveMatMulCost = 1.0 * eSub * lSub * hSub * 2;// 2 for Mul_ADD + + // devices peak compute value / memory bandwidth + const float penalty = 30.0;//FIXME: Find beter way to set it + float saveCost = saveMatMulCost - (AComputeCost + BComputeCost + CComputeCost) * penalty; + + if (saveCost <= 0.0f) { + Unit unit; + auto res = _generateBasicMatMul(e, l, h, AT, BT, CT, COT, postType, unit); + mUnits.emplace_back(unit); + return res; + } + + // Strassen Construct + currentDepth += 1; + + auto maxlH = std::max(lSub, hSub); + + AutoMemory YAddr(hSub * lSub, mOpenCLBackend); + AutoMemory XAddr(maxlH * eSub, mOpenCLBackend); + + MatrixInfo Y; + Y.stackIndex = (int)mStack.size(); + mStack.emplace_back(YAddr.get()); + Y.offsetBytes = 0; + Y.lineStrideBytes = hSub * mBytes; + MatrixInfo X; + X.stackIndex = (int)mStack.size(); + X.offsetBytes = 0; + X.lineStrideBytes = eSub * mBytes; + mStack.emplace_back(XAddr.get()); + + MatrixInfo CX; + CX.stackIndex = X.stackIndex; + CX.offsetBytes = 0; + CX.lineStrideBytes = hSub * mBytes; + + MatrixInfo a11 = AT; + MatrixInfo a12 = AT; + a12.offsetBytes = AT.offsetBytes + AT.lineStrideBytes * lSub; + MatrixInfo a21 = AT; + a21.offsetBytes = AT.offsetBytes + eSub * mBytes; + MatrixInfo a22 = AT; + a22.offsetBytes = AT.offsetBytes + eSub * mBytes + AT.lineStrideBytes * lSub; + + MatrixInfo b11 = BT; + MatrixInfo b12 = BT; + b12.offsetBytes = BT.offsetBytes + hSub * mBytes; + MatrixInfo b21 = BT; + b21.offsetBytes = BT.offsetBytes + BT.lineStrideBytes * lSub; + MatrixInfo b22 = BT; + b22.offsetBytes = BT.offsetBytes + BT.lineStrideBytes * lSub + hSub * mBytes; + + MatrixInfo c11 = CT; + MatrixInfo c12 = CT; + c12.offsetBytes = CT.offsetBytes + hSub * mBytes; + MatrixInfo c21 = CT; + c21.offsetBytes = CT.offsetBytes + CT.lineStrideBytes * eSub; + MatrixInfo c22 = CT; + c22.offsetBytes = CT.offsetBytes + CT.lineStrideBytes * eSub + hSub * mBytes; + + MatrixInfo Empty; + Empty.stackIndex = -1; + + { + // S3=A11-A21, T3=B22-B12, P7=S3*T3 + { + Unit unit; + _generateBinary(mStack[X.stackIndex], mStack[a11.stackIndex], mStack[a21.stackIndex], X.offsetBytes/mBytes, a11.offsetBytes/mBytes, a21.offsetBytes/mBytes, X.lineStrideBytes/mBytes, a11.lineStrideBytes/mBytes, a21.lineStrideBytes/mBytes, eSub, lSub, false, unit); + mUnits.emplace_back(unit); + } + { + Unit unit; + _generateBinary(mStack[Y.stackIndex], mStack[b22.stackIndex], mStack[b12.stackIndex], Y.offsetBytes/mBytes, b22.offsetBytes/mBytes, b12.offsetBytes/mBytes, Y.lineStrideBytes/mBytes, b22.lineStrideBytes/mBytes, b12.lineStrideBytes/mBytes, hSub, lSub, false, unit); + mUnits.emplace_back(unit); + } + + auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c21, Empty, currentDepth, 0); + if (code != NO_ERROR) { + return code; + } + } + { + // S1=A21+A22, T1=B12-B11, P5=S1T1 + { + Unit unit; + _generateBinary(mStack[X.stackIndex], mStack[a21.stackIndex], mStack[a22.stackIndex], X.offsetBytes/mBytes, a21.offsetBytes/mBytes, a22.offsetBytes/mBytes, X.lineStrideBytes/mBytes, a21.lineStrideBytes/mBytes, a22.lineStrideBytes/mBytes, eSub, lSub, true, unit); + mUnits.emplace_back(unit); + } + { + Unit unit; + _generateBinary(mStack[Y.stackIndex], mStack[b12.stackIndex], mStack[b11.stackIndex], Y.offsetBytes/mBytes, b12.offsetBytes/mBytes, b11.offsetBytes/mBytes, Y.lineStrideBytes/mBytes, b12.lineStrideBytes/mBytes, b11.lineStrideBytes/mBytes, hSub, lSub, false, unit); + mUnits.emplace_back(unit); + } + + auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c22, Empty, currentDepth, 0); + if (code != NO_ERROR) { + return code; + } + } + { + // S2=S1-A11, T2=B22-T1, P6=S2T2 + { + Unit unit; + _generateBinary(mStack[X.stackIndex], mStack[X.stackIndex], mStack[a11.stackIndex], X.offsetBytes/mBytes, X.offsetBytes/mBytes, a11.offsetBytes/mBytes, X.lineStrideBytes/mBytes, X.lineStrideBytes/mBytes, a11.lineStrideBytes/mBytes, eSub, lSub, false, unit); + mUnits.emplace_back(unit); + } + { + Unit unit; + _generateBinary(mStack[Y.stackIndex], mStack[b22.stackIndex], mStack[Y.stackIndex], Y.offsetBytes/mBytes, b22.offsetBytes/mBytes, Y.offsetBytes/mBytes, Y.lineStrideBytes/mBytes, b22.lineStrideBytes/mBytes, Y.lineStrideBytes/mBytes, hSub, lSub, false, unit); + mUnits.emplace_back(unit); + } + + auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c12, Empty, currentDepth, 0); + if (code != NO_ERROR) { + return code; + } + } + { + // S4=A12-S2, P3=S4*B22, P1=A11*B11 + { + Unit unit; + _generateBinary(mStack[X.stackIndex], mStack[a12.stackIndex], mStack[X.stackIndex], X.offsetBytes/mBytes, a12.offsetBytes/mBytes, X.offsetBytes/mBytes, X.lineStrideBytes/mBytes, a12.lineStrideBytes/mBytes, X.lineStrideBytes/mBytes, eSub, lSub, false, unit); + mUnits.emplace_back(unit); + } + + auto code = _generateMatMul(eSub, lSub, hSub, X, b22, c11, Empty, currentDepth, 0); + if (code != NO_ERROR) { + return code; + } + code = _generateMatMul(eSub, lSub, hSub, a11, b11, CX, Empty, currentDepth, 0); + if (code != NO_ERROR) { + return code; + } + } + { + // U2=P1+P6, U3=U2+P7, U4=U2+P5, U7=U3+P5 + // U5=U4+P3, T4=T2-B21, P4=A22*T4 + { + Unit unit; + _generateCFunction(mStack[CT.stackIndex], CT.offsetBytes/mBytes, CT.lineStrideBytes/mBytes, mStack[CX.stackIndex], hSub, eSub, unit); + mUnits.emplace_back(unit); + } + + { + Unit unit; + _generateBinary(mStack[Y.stackIndex], mStack[Y.stackIndex], mStack[b21.stackIndex], Y.offsetBytes/mBytes, Y.offsetBytes/mBytes, b21.offsetBytes/mBytes, Y.lineStrideBytes/mBytes, Y.lineStrideBytes/mBytes, b21.lineStrideBytes/mBytes, hSub, lSub, false, unit); + mUnits.emplace_back(unit); + } + } + { + auto code = _generateMatMul(eSub, lSub, hSub, a22, Y, c11, Empty, currentDepth, 0); + if (code != NO_ERROR) { + return code; + } + // U6=U3-P4, P2=A12*B21, U1=P1+P2 + { + Unit unit; + _generateBinary(mStack[c21.stackIndex], mStack[c21.stackIndex], mStack[c11.stackIndex], c21.offsetBytes/mBytes, c21.offsetBytes/mBytes, c11.offsetBytes/mBytes, c21.lineStrideBytes/mBytes, c21.lineStrideBytes/mBytes, c11.lineStrideBytes/mBytes, hSub, eSub, false, unit); + mUnits.emplace_back(unit); + } + + { + auto code = _generateMatMul(eSub, lSub, hSub, a12, b21, c11, Empty, currentDepth, 0); + if (code != NO_ERROR) { + return code; + } + Unit unit; + _generateBinary(mStack[c11.stackIndex], mStack[c11.stackIndex], mStack[CX.stackIndex], c11.offsetBytes/mBytes, c11.offsetBytes/mBytes, CX.offsetBytes/mBytes, c11.lineStrideBytes/mBytes, c11.lineStrideBytes/mBytes, CX.lineStrideBytes/mBytes, hSub, eSub, true, unit); + mUnits.emplace_back(unit); + } + + } + return NO_ERROR; +} + +void StrassenMatrixComputor::onReset() { + mStack.clear(); + mUnits.clear(); +} + +ErrorCode StrassenMatrixComputor::onEncode(int e, int l, int h, int as, int bs, int cs, const cl::Buffer AT, const cl::Buffer BT, cl::Buffer CT, bool useBias, const cl::Buffer Bias) { + mM = e; + mN = h; + mK = l; + MatrixInfo a,b,c,bias; + bias.stackIndex = -1; + mUnits.clear(); + mStack = {AT, BT, CT}; + if (useBias) { + bias.stackIndex = 3; + bias.offsetBytes = 0; + mStack.emplace_back(Bias); + } + a.stackIndex = 0; + a.lineStrideBytes = as * mBytes; + a.offsetBytes = 0; + + b.stackIndex = 1; + b.lineStrideBytes = bs * mBytes; + b.offsetBytes = 0; + + c.stackIndex = 2; + c.lineStrideBytes = cs * mBytes; + c.offsetBytes = 0; + return _generateMatMul(e, l, h, a, b, c, bias, 0, useBias); +} + +void StrassenMatrixComputor::onExecute() { + // All is done in onResize, just execute it + auto res = CL_SUCCESS; + int count = 0; + for (auto &unit : mUnits) { + if(unit.localWorkSize[0] == 0 || unit.localWorkSize[1] == 0) { + unit.localWorkSize = cl::NullRange; + } +#ifdef ENABLE_OPENCL_TIME_PROFILER + cl::Event event; + res = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueNDRangeKernel(unit.kernel->get(), + cl::NullRange, + unit.globalWorkSize, + unit.localWorkSize, + nullptr, + &event); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"Strassen-" + std::to_string(count++) + "-m" + std::to_string(mM) + "-n" + std::to_string(mN) + "-k" + std::to_string(mK), event}); +#else + res = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueNDRangeKernel(unit.kernel->get(), + cl::NullRange, + unit.globalWorkSize, + unit.localWorkSize); +#endif + MNN_CHECK_CL_SUCCESS(res, "Strassen execute"); + } +} +} // namespace MNN +} +#endif diff --git a/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.hpp b/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.hpp new file mode 100644 index 000000000..dc6a9fa7a --- /dev/null +++ b/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.hpp @@ -0,0 +1,67 @@ +// +// StrassenMatmulComputor.hpp +// MNN +// +// Created by MNN on 2024/08/01. +// Copyright © 2018, Alibaba Group Holding Limited +// +#ifndef MNN_OPENCL_BUFFER_CLOSED + +#ifndef StrassenMatmulOpenCLComputor_hpp +#define StrassenMatmulOpenCLComputor_hpp + +#include "core/BufferAllocator.hpp" +#include "core/Backend.hpp" +#include "backend/opencl/execution/image/CommonExecution.hpp" + +namespace MNN { +namespace OpenCL { +/** + Based on + Boyer, B., Dumas, J.-G., Pernet, C., & Zhou, W. (2007). Memory efficient scheduling of Strassen-Winogradʼs matrix multiplication algorithm. Proceedings of the 2009 international symposium on Symbolic and algebraic computation ISSAC 09, 55. ACM Press. Retrieved from http://arxiv.org/abs/0707.2347 + + Use Table 2 + */ +class StrassenMatrixComputor { +public: + StrassenMatrixComputor(Backend* bn, int maxDepth); + virtual ~StrassenMatrixComputor(); + + ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const cl::Buffer AT, const cl::Buffer BT, cl::Buffer CT, bool useBias, const cl::Buffer Bias); + + void onExecute(); + + void onReset(); +private: + struct MatrixInfo { + int stackIndex; + int offsetBytes; + int lineStrideBytes; + }; + + /* postType: + 0 --> without post process + 1 --> with bias (one dimension) + 2 --> with feature map D to eltwise add ( Y = X + D) + 3 --> with feature map D to eltwise sub ( Y = X - D) + 4 --> with feature map D to eltwise sub and get negative( Y = D - X) + */ + ErrorCode _generateMatMul(int e, int l, int h, const MatrixInfo& AT, const MatrixInfo& BT, const MatrixInfo& CT, const MatrixInfo& COT, int currentDepth, int postType = 0); + ErrorCode _generateBasicMatMul(int e, int l, int h, const MatrixInfo& AT, const MatrixInfo& BT, const MatrixInfo& CT, const MatrixInfo& COT, int postType, Unit& unit); + + ErrorCode _generateBinary(cl::Buffer ptrC, cl::Buffer ptrA, cl::Buffer ptrB, int offsetC, int offsetA, int offsetB, int elementStrideC, int elementStrideA, int elementStrideB, int width, int height, bool isAdd, Unit& unit); + + ErrorCode _generateCFunction(cl::Buffer ptrC, int offsetC, int elementStrideC, cl::Buffer ptrA, int width, int height, Unit& unit); + +private: + std::vector mUnits; + int mMaxDepth; + OpenCLBackend* mOpenCLBackend; + int mM, mN, mK; + std::vector mStack; + int mBytes = 4; +}; +} // namespace MNN +} +#endif /* StrassenMatmulOpenCLComputor_hpp */ +#endif diff --git a/source/backend/opencl/execution/cl/buffer_convert_quant.cl b/source/backend/opencl/execution/cl/buffer_convert_quant.cl index 1215cc71b..5043e1418 100644 --- a/source/backend/opencl/execution/cl/buffer_convert_quant.cl +++ b/source/backend/opencl/execution/cl/buffer_convert_quant.cl @@ -20,18 +20,18 @@ __kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int8(GLOBAL_SIZE_2_DIMS __global char *output) { int image_width_idx = get_global_id(0); // ic int image_height_idx = get_global_id(1); // oc/4 h w - + DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx); - + const int input_channel_4_idx = image_width_idx; const int output_channel_4_idx = (image_height_idx / height_width_size) * 4; const int height_width_idx = image_height_idx % height_width_size; const int buffer_height_idx = height_width_idx / kernel_shape.y; const int buffer_width_idx = height_width_idx % kernel_shape.y; - + const int buffer_offset = output_channel_4_idx * ic_h_w_size + input_channel_4_idx * height_width_size + buffer_height_idx * kernel_shape.y + buffer_width_idx; - + char4 output_values = 0; if (output_channel_4_idx < output_channel) { const int remain_channel = output_channel - output_channel_4_idx; @@ -51,7 +51,7 @@ __kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int8(GLOBAL_SIZE_2_DIMS output_values.y = (char)(*(input_ptr + offset)); offset += ic_h_w_size; output_values.z = (char)(*(input_ptr + offset)); - + } else if (remain_channel == 2) { int offset = buffer_offset; output_values.x = (char)(*(input_ptr + offset)); @@ -70,7 +70,7 @@ __kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int8(GLOBAL_SIZE_2_DIMS #ifdef USE_LOW_BIT_WEIGHT_INT4 // convert kernel : from int8 buffer(oihw) to int4 image(oc/4 h w , ic oc4) __kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int4(GLOBAL_SIZE_2_DIMS - __global const char *input_ptr, + __global const uchar *input_ptr, __private const int output_channel, __private const int2 kernel_shape, __private const int ic_h_w_size, @@ -78,53 +78,26 @@ __kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int4(GLOBAL_SIZE_2_DIMS __global uchar *output) { int image_width_idx = get_global_id(0); // ic int image_height_idx = get_global_id(1); // oc/4 h w - + DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx); - + const int input_channel_4_idx = image_width_idx; const int output_channel_4_idx = (image_height_idx / height_width_size) * 4; const int height_width_idx = image_height_idx % height_width_size; const int buffer_height_idx = height_width_idx / kernel_shape.y; const int buffer_width_idx = height_width_idx % kernel_shape.y; - - const int buffer_offset = output_channel_4_idx * ic_h_w_size + input_channel_4_idx * height_width_size + - buffer_height_idx * kernel_shape.y + buffer_width_idx; - - char4 output_values_int8 = 0; - if (output_channel_4_idx < output_channel) { - const int remain_channel = output_channel - output_channel_4_idx; - if (remain_channel >= 4) { - int offset = buffer_offset; - output_values_int8.x = (char)(*(input_ptr + offset)); - offset = mad24(1, ic_h_w_size, offset); - output_values_int8.y = (char)(*(input_ptr + offset)); - offset += ic_h_w_size; - output_values_int8.z = (char)(*(input_ptr + offset)); - offset += ic_h_w_size; - output_values_int8.w = (char)(*(input_ptr + offset)); - } else if (remain_channel == 3) { - int offset = buffer_offset; - output_values_int8.x = (char)(*(input_ptr + offset)); - offset = mad24(1, ic_h_w_size, offset); - output_values_int8.y = (char)(*(input_ptr + offset)); - offset += ic_h_w_size; - output_values_int8.z = (char)(*(input_ptr + offset)); - - } else if (remain_channel == 2) { - int offset = buffer_offset; - output_values_int8.x = (char)(*(input_ptr + offset)); - offset = mad24(1, ic_h_w_size, offset); - output_values_int8.y = (char)(*(input_ptr + offset)); - } else if (remain_channel == 1) { - int offset = buffer_offset; - output_values_int8.x = (char)(*(input_ptr + offset)); - } - } - + + const int buffer_offset = output_channel_4_idx * ic_h_w_size + input_channel_4_idx * height_width_size + buffer_height_idx * kernel_shape.y + buffer_width_idx; + int index0 = buffer_offset, index1 = buffer_offset + ic_h_w_size, index2 = buffer_offset + 2 * ic_h_w_size, index3 = buffer_offset + 3 * ic_h_w_size; + uchar2 output_values_int4 = (uchar2)(0, 0); - output_values_int4.s0 = (output_values_int8.x + 8) * 16 + (output_values_int8.y + 8); - output_values_int4.s1 = (output_values_int8.z + 8) * 16 + (output_values_int8.w + 8); - + uchar s0 = input_ptr[index0/2]; + uchar s1 = output_channel_4_idx + 1 >= output_channel ? 0 : input_ptr[index1/2]; + uchar s2 = output_channel_4_idx + 1 >= output_channel ? 0 : input_ptr[index2/2]; + uchar s3 = output_channel_4_idx + 1 >= output_channel ? 0 : input_ptr[index3/2]; + output_values_int4.x = ((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f)); + output_values_int4.y = ((index2 % 2) == 0 ? (s2 & 0xf0) : (s2 << 4)) | ((index3 % 2) == 0 ? (s3 >> 4) : (s3 & 0x0f)); + const int out_offset = (image_width_idx*height_width_size*((output_channel+3)/4)+image_height_idx)*2; vstore2(output_values_int4, 0, output+out_offset); } @@ -137,106 +110,134 @@ __kernel void conv2d_filter_buffer_to_nc4hw4_buffer_int4(GLOBAL_SIZE_2_DIMS a = (uchar16)(((b.s0 + 8) << 4) + b.s1 + 8, ((b.s2 + 8) << 4) + b.s3 + 8, ((b.s4 + 8) << 4) + b.s5 + 8, ((b.s6 + 8) << 4) + b.s7 + 8, ((b.s8 + 8) << 4) + b.s9 + 8, ((b.sa + 8) << 4) + b.sb + 8, ((b.sc + 8) << 4) + b.sd + 8, ((b.se + 8) << 4) + b.sf + 8, \ ((c.s0 + 8) << 4) + c.s1 + 8, ((c.s2 + 8) << 4) + c.s3 + 8, ((c.s4 + 8) << 4) + c.s5 + 8, ((c.s6 + 8) << 4) + c.s7 + 8, ((c.s8 + 8) << 4) + c.s9 + 8, ((c.sa + 8) << 4) + c.sb + 8, ((c.sc + 8) << 4) + c.sd + 8, ((c.se + 8) << 4) + c.sf + 8); __kernel void conv2d_1x1_weight_quant_buffer(GLOBAL_SIZE_2_DIMS - __global const char *input_ptr, #ifdef USE_LOW_BIT_WEIGHT_INT4 - __global uchar *output_ptr, + __global const uchar *input_ptr, #else - __global char *output_ptr, + __global const char *input_ptr, #endif + __global char *output_ptr, __private const int input_channel, __private const int output_channel) { int x = get_global_id(0); // ic / 16 int y = get_global_id(1); // oc - + DEAL_NON_UNIFORM_DIM2(x, y); const int xin = x << 4; const int outputChannelC4 = (output_channel + 3) >> 2; - const int inputOffset = y * input_channel + xin; - char16 weight = 0; -#ifdef INPUT_CHANNEL_LEAVE - if(xin + 15 >= input_channel){ - char *weight_ptr = (char*)&weight; - for(int i = 0, j = 0; xin + i < input_channel && j < 16; ++i, ++j){ - weight_ptr[j] = input_ptr[inputOffset + i]; - } - }else { - weight = vload16(0, input_ptr + inputOffset); +#ifdef USE_LOW_BIT_WEIGHT_INT4 + const int outputOffset = ((x * outputChannelC4 * 4 * 8 + y * 8)); +#ifdef CHANNEL_LEAVE + for(int i = 0; i < 8; ++i){ + int index0 = y * input_channel + xin + i * 2; + int index1 = y * input_channel + xin + i * 2 + 1; + uchar s0 = input_ptr[index0/2]; + uchar s1 = input_ptr[index1/2]; + output_ptr[outputOffset + i] = ((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f)); } #else - weight = vload16(0, input_ptr + inputOffset); + const int inputOffset = (y * input_channel + xin)/2; + vstore8(convert_char8(vload8(0,input_ptr+inputOffset)),0,output_ptr+outputOffset); #endif - -#ifdef USE_LOW_BIT_WEIGHT_INT4 - const int outputOffset = ((x * outputChannelC4 * 4 * 8 + y * 8)); - uchar8 outWeight; - CHAR16_TO_UCHAR8(outWeight, weight); - vstore8(outWeight, 0, output_ptr + outputOffset); #else + const int inputOffset = y * input_channel + xin; const int outputOffset = (x * outputChannelC4 * 4 + y) << 4; - vstore16(weight, 0, output_ptr + outputOffset); + vstore16(convert_char16(vload16(0, input_ptr + inputOffset)), 0, output_ptr + outputOffset); #endif } __kernel void conv2d_1x1_weight_quant_image(GLOBAL_SIZE_2_DIMS - __global const char *input_ptr, +#ifdef USE_LOW_BIT_WEIGHT_INT4 + __global const uchar *input_ptr, +#else + __global const uchar *input_ptr, +#endif __write_only image2d_t output, __private const int input_channel, __private const int output_channel) { - -#ifdef USE_LOW_BIT_WEIGHT_INT4 - int x = get_global_id(0); // ic / 32 + + int x = get_global_id(0); // ic / 16 int y = get_global_id(1); // oc - + DEAL_NON_UNIFORM_DIM2(x, y); - const int outputChannelC4 = (output_channel + 3) >> 2; - const int xin = x << 5; - const int inputOffset = y * input_channel + xin; - char16 weight00 = 0, weight01 = 0; -#ifdef INPUT_CHANNEL_LEAVE - if(xin + 31 >= input_channel){ - char *weight00_ptr = (char*)&weight00; - char *weight01_ptr = (char*)&weight01; - int i = 0; - for(int j = 0; xin + i < input_channel && j < 16; ++i, ++j){ - weight00_ptr[j] = input_ptr[inputOffset + i]; - } - for(int j = 0; xin + i < input_channel && j < 16; ++i, ++j){ - weight01_ptr[j] = input_ptr[inputOffset + i]; - } - }else { - weight00 = vload16(0, input_ptr + inputOffset); - weight01 = vload16(0, input_ptr + inputOffset + 16); + const int xin = x << 4; +#ifdef USE_LOW_BIT_WEIGHT_INT4 +#ifdef CHANNEL_LEAVE + uchar8 out = 0; + uchar *out_ptr = (uchar*)&out; + for(int i = 0; i < 8; ++i){ + int index0 = y * input_channel + xin + i * 2; + int index1 = y * input_channel + xin + i * 2 + 1; + uchar s0 = input_ptr[index0/2]; + uchar s1 = input_ptr[index1/2]; + out_ptr[i] = ((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f)); } + write_imageui(output, (int2)(y, x), convert_uint4(as_ushort4(out))); #else - weight00 = vload16(0, input_ptr + inputOffset); - weight01 = vload16(0, input_ptr + inputOffset + 16); + const int inputOffset = (y * input_channel + xin)/2; + write_imageui(output, (int2)(y, x), convert_uint4(as_ushort4(vload8(0, input_ptr + inputOffset)))); #endif - - uchar16 outWeight; - CHAR32_TO_UCHAR16(outWeight, weight00, weight01); - write_imagei(output, (int2)(y, x), as_int4(outWeight)); #else - int x = get_global_id(0); // ic / 16 - int y = get_global_id(1); // oc - - DEAL_NON_UNIFORM_DIM2(x, y); - const int xin = x << 4; const int inputOffset = y * input_channel + xin; - const int outputChannelC4 = (output_channel + 3) >> 2; - char16 weight = 0; -#ifdef INPUT_CHANNEL_LEAVE - if(xin + 15 >= input_channel){ - char *weight_ptr = (char*)&weight; - for(int i = 0, j = 0; xin + i < input_channel && j < 16; ++i, ++j){ - weight_ptr[j] = input_ptr[inputOffset + i]; + write_imagei(output, (int2)(y, x), as_int4(vload16(0, input_ptr + inputOffset))); +#endif +} + +__kernel void conv2d_1x1_ic_oc_weight_quant_buffer(GLOBAL_SIZE_2_DIMS +#ifdef USE_LOW_BIT_WEIGHT_INT4 + __global const uchar *input_ptr, + __global uchar *output_ptr, //(Ci/packCin, Co/packCout, packCin, packCout) +#else + __global const char *input_ptr, + __global char *output_ptr, //(Ci/packCin, Co/packCout, packCin, packCout) +#endif + __private const int input_channel, + __private const int output_channel, + __private const int icPack, + __private const int ocPack) { + int x = get_global_id(0); // ic / icPack + int y = get_global_id(1); // oc / ocPack + + DEAL_NON_UNIFORM_DIM2(x, y); + const int xin = x * icPack; + const int yin = y * ocPack; + const int inputChannelC4 = (input_channel + icPack - 1) / icPack; + const int outputChannelC4 = (output_channel + ocPack - 1) / ocPack; +#ifdef USE_LOW_BIT_WEIGHT_INT4 + const int inputOffset = (yin * input_channel + xin) / 2; + const int outputOffset = ((x * outputChannelC4 + y) * icPack * ocPack) / 2; +#ifdef CHANNEL_LEAVE + for(int i = 0; i < icPack; ++i){ + for(int j = 0; j < ocPack / 2; ++j){ + int index0 = (yin + j * 2) * input_channel + xin + i; + int index1 = (yin + j * 2 + 1) * input_channel + xin + i; + uchar s0 = input_ptr[index0/2]; + uchar s1 = input_ptr[index1/2]; + s0 = (index0 % 2) == 0 ? (s0 & 0xf0) : ((s0 & 0x0f) << 4); + s1 = (index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f); + output_ptr[outputOffset + i * (ocPack / 2) + j] = s0 | s1; } - }else { - weight = vload16(0, input_ptr + inputOffset); } #else - weight = vload16(0, input_ptr + inputOffset); + for(int i = 0; i < icPack/2; ++i){ + for(int j = 0; j < ocPack / 2; ++j){ + char s0 = input_ptr[inputOffset + (j * 2) * (input_channel / 2) + i]; + char s1 = input_ptr[inputOffset + (j * 2 + 1) * (input_channel / 2) + i]; + char d0 = (s0 & 0xf0) | ((s1 & 0xf0) >> 4); + char d1 = ((s0 & 0x0f) << 4) | (s1 & 0x0f); + output_ptr[outputOffset + (i * 2) * (ocPack / 2) + j] = d0; + output_ptr[outputOffset + (i * 2 + 1) * (ocPack / 2) + j] = d1; + } + } #endif - - write_imagei(output, (int2)(y, x), as_int4(weight)); +#else + const int inputOffset = yin * input_channel + xin; + const int outputOffset = (x * outputChannelC4 + y) * icPack * ocPack; + for(int i = 0; i < icPack; ++i){ + for(int j = 0; j < ocPack; ++j){ + output_ptr[outputOffset + i * ocPack + j] = input_ptr[inputOffset + j * input_channel + i]; + } + } #endif } + + diff --git a/source/backend/opencl/execution/cl/gemm_quant_batch_buf.cl b/source/backend/opencl/execution/cl/gemm_quant_batch_buf.cl index 3fc8fd050..083268503 100644 --- a/source/backend/opencl/execution/cl/gemm_quant_batch_buf.cl +++ b/source/backend/opencl/execution/cl/gemm_quant_batch_buf.cl @@ -481,7 +481,8 @@ __kernel void gemm_b4_c1_buf(GLOBAL_SIZE_DIM2 #endif vstore4(CONVERT_FLOAT4(out), 0, output+out_offset); } -__kernel void gemm_b4_c2_image(GLOBAL_SIZE_DIM2 + +__kernel void gemm_b4_c4_image(GLOBAL_SIZE_DIM2 __global const FLOAT* input, __read_only image2d_t weight, __global const float *dequantScaleOffset, @@ -495,17 +496,18 @@ __kernel void gemm_b4_c2_image(GLOBAL_SIZE_DIM2 const int y = get_global_id(1); //b UNIFORM_BOUNDRY_CHECK(x, y); - const int out_c_idx = x << 1; + const int out_c_idx = x << 2; const int out_b_idx = y << 2; - COMPUTE_FLOAT2 bias0 = CONVERT_COMPUTE_FLOAT2(vload2(0, bias + out_c_idx)); + COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(0, bias + out_c_idx)); COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0.s0; COMPUTE_FLOAT4 out1 = (COMPUTE_FLOAT4)bias0.s1; + COMPUTE_FLOAT4 out2 = (COMPUTE_FLOAT4)bias0.s2; + COMPUTE_FLOAT4 out3 = (COMPUTE_FLOAT4)bias0.s3; int input_offset = out_b_idx * srcChannelC4 * 4; int out_offset = (out_b_idx * dstChannelC4 + out_c_idx) * 4; -#if (defined USE_LOW_BIT_WEIGHT_INT8) const int loop = (blockDim + 15) / 16; #ifdef INPUT_CHANNEL_LEAVE const int loop_end = max(loop - 1, 0); @@ -513,126 +515,208 @@ __kernel void gemm_b4_c2_image(GLOBAL_SIZE_DIM2 #else const int loop_end = loop; #endif -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - const int loop = (blockDim + 31) / 32; - #ifdef INPUT_CHANNEL_LEAVE - const int loop_end = max(loop - 1, 0); - const int remain = blockDim - loop_end*32; - #else - const int loop_end = loop; - #endif -#endif for (int i = 0; i < blockNum; i++){ int kindex = i * dstChannelC4 * 4 * 2; - COMPUTE_FLOAT4 ScaleOffset = CONVERT_COMPUTE_FLOAT4(vload4(0, dequantScaleOffset + out_c_idx * 2 + kindex)); -#if (defined USE_LOW_BIT_WEIGHT_INT8) + COMPUTE_FLOAT8 ScaleOffset = CONVERT_COMPUTE_FLOAT8(vload8(0, dequantScaleOffset + out_c_idx * 2 + kindex)); for (int j = 0; j < loop_end; j++) { int k = i * loop + j; int k16 = k << 4; + #if (defined USE_LOW_BIT_WEIGHT_INT8) COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3; + COMPUTE_FLOAT16 weights2 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 2, k)))) * ScaleOffset.s4 + ScaleOffset.s5; + COMPUTE_FLOAT16 weights3 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 3, k)))) * ScaleOffset.s6 + ScaleOffset.s7; + #elif (defined USE_LOW_BIT_WEIGHT_INT4) + COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; + { + uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); + uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k)))); + uchar8 charWeightsInt42 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 2, k)))); + uchar8 charWeightsInt43 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 3, k)))); + char16 charWeights0 = 0; + char16 charWeights1 = 0; + char16 charWeights2 = 0; + char16 charWeights3 = 0; + UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40); + UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41); + UCHAR8_TO_CHAR16(charWeights2, charWeightsInt42); + UCHAR8_TO_CHAR16(charWeights3, charWeightsInt43); + weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + weights2 = CONVERT_COMPUTE_FLOAT16(charWeights2) * ScaleOffset.s4 + ScaleOffset.s5; + weights3 = CONVERT_COMPUTE_FLOAT16(charWeights3) * ScaleOffset.s6 + ScaleOffset.s7; + } + #endif COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; + COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2; + COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3; #pragma unroll for (int i = 0; i < 16; ++i){ COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); out = mad(in, weights0_ptr[i], out); out1 = mad(in, weights1_ptr[i], out1); + out2 = mad(in, weights2_ptr[i], out2); + out3 = mad(in, weights3_ptr[i], out3); } } #ifdef INPUT_CHANNEL_LEAVE { int k = i * loop + loop_end; int k16 = k << 4; + #if (defined USE_LOW_BIT_WEIGHT_INT8) COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3; + COMPUTE_FLOAT16 weights2 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 2, k)))) * ScaleOffset.s4 + ScaleOffset.s5; + COMPUTE_FLOAT16 weights3 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 3, k)))) * ScaleOffset.s6 + ScaleOffset.s7; + #elif (defined USE_LOW_BIT_WEIGHT_INT4) + COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; + { + uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); + uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k)))); + uchar8 charWeightsInt42 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 2, k)))); + uchar8 charWeightsInt43 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 3, k)))); + char16 charWeights0 = 0; + char16 charWeights1 = 0; + char16 charWeights2 = 0; + char16 charWeights3 = 0; + UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40); + UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41); + UCHAR8_TO_CHAR16(charWeights2, charWeightsInt42); + UCHAR8_TO_CHAR16(charWeights3, charWeightsInt43); + weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + weights2 = CONVERT_COMPUTE_FLOAT16(charWeights2) * ScaleOffset.s4 + ScaleOffset.s5; + weights3 = CONVERT_COMPUTE_FLOAT16(charWeights3) * ScaleOffset.s6 + ScaleOffset.s7; + } + #endif COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; + COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2; + COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3; #pragma unroll for (int i = 0; i < remain; ++i){ COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); out = mad(in, weights0_ptr[i], out); out1 = mad(in, weights1_ptr[i], out1); + out2 = mad(in, weights2_ptr[i], out2); + out3 = mad(in, weights3_ptr[i], out3); } } #endif -#elif (defined USE_LOW_BIT_WEIGHT_INT4) + } + +#ifdef RELU + out = fmax(out, (COMPUTE_FLOAT4)0); + out1 = fmax(out1, (COMPUTE_FLOAT4)0); + out2 = fmax(out2, (COMPUTE_FLOAT4)0); + out3 = fmax(out3, (COMPUTE_FLOAT4)0); +#endif +#ifdef RELU6 + out = clamp(out, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); + out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); + out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); + out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); +#endif + vstore4(CONVERT_FLOAT4(out), 0, output + out_offset); + vstore4(CONVERT_FLOAT4(out1), 0, output + out_offset + 4); + vstore4(CONVERT_FLOAT4(out2), 0, output + out_offset + 8); + vstore4(CONVERT_FLOAT4(out3), 0, output + out_offset + 12); +} +__kernel void gemm_b4_c2_image(GLOBAL_SIZE_DIM2 + __global const FLOAT* input, + __read_only image2d_t weight, + __global const float *dequantScaleOffset, + __global const FLOAT *bias, + __global FLOAT* output, + __private const int dstChannelC4, + __private const int srcChannelC4, + __private const int blockNum, + __private const int blockDim) { + const int x = get_global_id(0); //c + const int y = get_global_id(1); //b + UNIFORM_BOUNDRY_CHECK(x, y); + + const int out_c_idx = x << 1; + const int out_b_idx = y << 2; + + COMPUTE_FLOAT2 bias0 = CONVERT_COMPUTE_FLOAT2(vload2(0, bias + out_c_idx)); + COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0.s0; + COMPUTE_FLOAT4 out1 = (COMPUTE_FLOAT4)bias0.s1; + + int input_offset = out_b_idx * srcChannelC4 * 4; + int out_offset = (out_b_idx * dstChannelC4 + out_c_idx) * 4; + + const int loop = (blockDim + 15) / 16; + #ifdef INPUT_CHANNEL_LEAVE + const int loop_end = max(loop - 1, 0); + const int remain = blockDim - loop_end*16; + #else + const int loop_end = loop; + #endif + + for (int i = 0; i < blockNum; i++){ + int kindex = i * dstChannelC4 * 4 * 2; + COMPUTE_FLOAT4 ScaleOffset = CONVERT_COMPUTE_FLOAT4(vload4(0, dequantScaleOffset + out_c_idx * 2 + kindex)); for (int j = 0; j < loop_end; j++) { int k = i * loop + j; - int k32 = k << 5; - COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; + int k16 = k << 4; + #if (defined USE_LOW_BIT_WEIGHT_INT8) + COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; + COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3; + #elif (defined USE_LOW_BIT_WEIGHT_INT4) + COMPUTE_FLOAT16 weights0, weights1; { - uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k))); + uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); + uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k)))); char16 charWeights0 = 0; char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); + UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40); + UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41); weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; - } - { - uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights2 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3; - weights3 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; } + #endif COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; - COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2; - COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3; #pragma unroll for (int i = 0; i < 16; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4)); + COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); out = mad(in, weights0_ptr[i], out); - out1 = mad(in, weights2_ptr[i], out1); - } - #pragma unroll - for (int i = 0; i < 16; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i + 16) * 4)); - out = mad(in, weights1_ptr[i], out); - out1 = mad(in, weights3_ptr[i], out1); + out1 = mad(in, weights1_ptr[i], out1); } } #ifdef INPUT_CHANNEL_LEAVE { int k = i * loop + loop_end; - int k32 = k << 5; - COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; + int k16 = k << 4; + #if (defined USE_LOW_BIT_WEIGHT_INT8) + COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; + COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3; + #elif (defined USE_LOW_BIT_WEIGHT_INT4) + COMPUTE_FLOAT16 weights0, weights1; { - uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k))); + uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); + uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k)))); char16 charWeights0 = 0; char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); + UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40); + UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41); weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; - } - { - uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights2 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3; - weights3 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; } + #endif COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; - COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2; - COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3; - for (int i = 0; i < min(16, remain); ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4)); + #pragma unroll + for (int i = 0; i < remain; ++i){ + COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); out = mad(in, weights0_ptr[i], out); - out1 = mad(in, weights2_ptr[i], out1); - } - for (int i = 16; i < remain; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4)); - out = mad(in, weights1_ptr[i - 16], out); - out1 = mad(in, weights3_ptr[i - 16], out1); + out1 = mad(in, weights1_ptr[i], out1); } } #endif -#endif //USE_LOW_BIT_WEIGHT_INT4 } #ifdef RELU @@ -669,7 +753,6 @@ __kernel void gemm_b4_c1_image(GLOBAL_SIZE_DIM2 int input_offset = out_b_idx * srcChannelC4 * 4; int out_offset = (out_b_idx * dstChannelC4 + out_c_idx) * 4; -#if (defined USE_LOW_BIT_WEIGHT_INT8) const int loop = (blockDim + 15) / 16; #ifdef INPUT_CHANNEL_LEAVE const int loop_end = max(loop - 1, 0); @@ -677,24 +760,24 @@ __kernel void gemm_b4_c1_image(GLOBAL_SIZE_DIM2 #else const int loop_end = loop; #endif -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - const int loop = (blockDim + 31) / 32; - #ifdef INPUT_CHANNEL_LEAVE - const int loop_end = max(loop - 1, 0); - const int remain = blockDim - loop_end*32; - #else - const int loop_end = loop; - #endif -#endif for (int i = 0; i < blockNum; ++i){ int kindex = i * dstChannelC4 * 4 * 2; COMPUTE_FLOAT2 ScaleOffset = CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx, dequantScaleOffset + kindex)); -#if (defined USE_LOW_BIT_WEIGHT_INT8) for (int j = 0; j < loop_end; j++) { int k = i * loop + j; int k16 = k << 4; + #if (defined USE_LOW_BIT_WEIGHT_INT8) COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; + #elif (defined USE_LOW_BIT_WEIGHT_INT4) + COMPUTE_FLOAT16 weights0; + { + uchar8 charWeightsInt4 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); + char16 charWeights = 0; + UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); + weights0 = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1; + } + #endif COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; #pragma unroll for (int i = 0; i < 16; ++i){ @@ -706,67 +789,25 @@ __kernel void gemm_b4_c1_image(GLOBAL_SIZE_DIM2 { int k = i * loop + loop_end; int k16 = k << 4; + #if (defined USE_LOW_BIT_WEIGHT_INT8) COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; - COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - #pragma unroll - for (int i = 0; i < remain; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights0_ptr[i], out); - } - } -#endif -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - for (int j = 0; j < loop_end; j++) { - int k = i * loop + j; - int k32 = k << 5; - COMPUTE_FLOAT16 weights0, weights1; + #elif (defined USE_LOW_BIT_WEIGHT_INT4) + COMPUTE_FLOAT16 weights0; { - uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; + uchar8 charWeightsInt4 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); + char16 charWeights = 0; + UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); + weights0 = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1; } + #endif COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; #pragma unroll - for (int i = 0; i < 16; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4)); - out = mad(in, weights0_ptr[i], out); - } - #pragma unroll - for (int i = 0; i < 16; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i + 16) * 4)); - out = mad(in, weights1_ptr[i], out); - } - } -#ifdef INPUT_CHANNEL_LEAVE - { - int k = i * loop + loop_end; - int k32 = k << 5; - COMPUTE_FLOAT16 weights0, weights1; - { - uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; - } - COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; - for (int i = 0; i < min(16, remain); ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4)); + for (int i = 0; i < remain; ++i){ + COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); out = mad(in, weights0_ptr[i], out); } - for (int i = 16; i < remain; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4)); - out = mad(in, weights1_ptr[i - 16], out); - } } #endif -#endif //USE_LOW_BIT_WEIGHT_INT4 } #ifdef RELU diff --git a/source/backend/opencl/execution/cl/gemv_conv1x1_buf.cl b/source/backend/opencl/execution/cl/gemv_conv1x1_buf.cl index 7b4433111..82b7b02db 100644 --- a/source/backend/opencl/execution/cl/gemv_conv1x1_buf.cl +++ b/source/backend/opencl/execution/cl/gemv_conv1x1_buf.cl @@ -851,33 +851,37 @@ __kernel void gemm_conv_c2_image(GLOBAL_SIZE_DIM2 int out_offset = (((out_b_idx * dstChannelC4 + out_c_idx/4) * height + out_h_idx) * width + out_w_idx) * 4 + (out_c_idx % 4); int wh = width * height * 4; -#if (defined USE_LOW_BIT_WEIGHT_INT8) const int loop = (blockDim + 15) / 16; #ifdef INPUT_CHANNEL_LEAVE const int loop_end = max(loop - 1, 0); #else const int loop_end = loop; #endif -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - const int loop = (blockDim + 31) / 32; - #ifdef INPUT_CHANNEL_LEAVE - const int loop_end = max(loop - 1, 0); - #else - const int loop_end = loop; - #endif -#endif for (int i = 0; i < blockNum; ++i){ int kindex = i * dstChannelC4 * 4 * 2; COMPUTE_FLOAT4 ScaleOffset = CONVERT_COMPUTE_FLOAT4(vload4(0, dequantScaleOffset + out_c_idx * 2 + kindex)); - #if (defined USE_LOW_BIT_WEIGHT_INT8) for (int j = 0; j < loop_end; j++) { int k = i * loop + j; #ifndef WIDTH_HEIGHT_1 int k4 = k << 2; #endif + #if (defined USE_LOW_BIT_WEIGHT_INT8) COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3; + #elif (defined USE_LOW_BIT_WEIGHT_INT4) + COMPUTE_FLOAT16 weights0, weights1; + { + uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); + uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k)))); + char16 charWeights0 = 0; + char16 charWeights1 = 0; + UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40); + UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41); + weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + } + #endif { COMPUTE_FLOAT16 in; #ifdef WIDTH_HEIGHT_1 @@ -937,8 +941,22 @@ __kernel void gemm_conv_c2_image(GLOBAL_SIZE_DIM2 { int k = i * loop + loop_end; int k4 = k << 2; + #if (defined USE_LOW_BIT_WEIGHT_INT8) COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3; + #elif (defined USE_LOW_BIT_WEIGHT_INT4) + COMPUTE_FLOAT16 weights0, weights1; + { + uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); + uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k)))); + char16 charWeights0 = 0; + char16 charWeights1 = 0; + UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40); + UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41); + weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + } + #endif PADZEROS(k, srcChannel, weights0); PADZEROS(k, srcChannel, weights1); { @@ -981,215 +999,6 @@ __kernel void gemm_conv_c2_image(GLOBAL_SIZE_DIM2 #endif } #endif - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - for (int j = 0; j < loop_end; j++) { - int k = i * loop + j; - #ifndef WIDTH_HEIGHT_1 - int k8 = k << 3; - #endif - COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; - { - uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; - } - { - uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights2 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3; - weights3 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - } - { - COMPUTE_FLOAT16 in0, in1; - #ifdef WIDTH_HEIGHT_1 - in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * 32)); - in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * 32 + 16)); - #else - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 1) * wh)); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 2) * wh)); - in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 3) * wh)); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 4) * wh)); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 5) * wh)); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 6) * wh)); - in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 7) * wh)); - #endif - - DOT16X16(in0, weights0, out.s0); - DOT16X16(in1, weights1, out.s0); - DOT16X16(in0, weights2, out.s1); - DOT16X16(in1, weights3, out.s1); - } - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in0, in1; - #ifdef WIDTH_HEIGHT_1 - in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset1 + k * 32)); - in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset1 + k * 32 + 16)); - #else - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 1) * wh)); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 2) * wh)); - in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 3) * wh)); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 4) * wh)); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 5) * wh)); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 6) * wh)); - in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 7) * wh)); - #endif - DOT16X16(in0, weights0, out1.s0); - DOT16X16(in1, weights1, out1.s0); - DOT16X16(in0, weights2, out1.s1); - DOT16X16(in1, weights3, out1.s1); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in0, in1; - #ifdef WIDTH_HEIGHT_1 - in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset2 + k * 32)); - in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset2 + k * 32 + 16)); - #else - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 1) * wh)); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 2) * wh)); - in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 3) * wh)); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 4) * wh)); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 5) * wh)); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 6) * wh)); - in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 7) * wh)); - #endif - DOT16X16(in0, weights0, out2.s0); - DOT16X16(in1, weights1, out2.s0); - DOT16X16(in0, weights2, out2.s1); - DOT16X16(in1, weights3, out2.s1); - } - if(isValidBatch3){ - COMPUTE_FLOAT16 in0, in1; - #ifdef WIDTH_HEIGHT_1 - in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset3 + k * 32)); - in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset3 + k * 32 + 16)); - #else - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 1) * wh)); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 2) * wh)); - in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 3) * wh)); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 4) * wh)); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 5) * wh)); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 6) * wh)); - in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 7) * wh)); - #endif - DOT16X16(in0, weights0, out3.s0); - DOT16X16(in1, weights1, out3.s0); - DOT16X16(in0, weights2, out3.s1); - DOT16X16(in1, weights3, out3.s1); - } - #endif - } - #ifdef INPUT_CHANNEL_LEAVE - { - int k = i * loop + loop_end; - int k8 = k << 3; - COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; - { - uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; - } - { - uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights2 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3; - weights3 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - } - PADZEROS(k, srcChannel, weights0); - PADZEROS(k + 15, srcChannel, weights1); - PADZEROS(k, srcChannel, weights2); - PADZEROS(k + 15, srcChannel, weights3); - { - COMPUTE_FLOAT16 in0, in1; - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 1) * wh) : (FLOAT4)0); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 2) * wh) : (FLOAT4)0); - in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 3) * wh) : (FLOAT4)0); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 4) * wh) : (FLOAT4)0); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 5) * wh) : (FLOAT4)0); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 6) * wh) : (FLOAT4)0); - in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 7) * wh) : (FLOAT4)0); - - DOT16X16(in0, weights0, out.s0); - DOT16X16(in1, weights1, out.s0); - DOT16X16(in0, weights2, out.s1); - DOT16X16(in1, weights3, out.s1); - } - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in0, in1; - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 1) * wh) : (FLOAT4)0); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 2) * wh) : (FLOAT4)0); - in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 3) * wh) : (FLOAT4)0); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 4) * wh) : (FLOAT4)0); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 5) * wh) : (FLOAT4)0); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 6) * wh) : (FLOAT4)0); - in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 7) * wh) : (FLOAT4)0); - - DOT16X16(in0, weights0, out1.s0); - DOT16X16(in1, weights1, out1.s0); - DOT16X16(in0, weights2, out1.s1); - DOT16X16(in1, weights3, out1.s1); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in0, in1; - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 1) * wh) : (FLOAT4)0); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 2) * wh) : (FLOAT4)0); - in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 3) * wh) : (FLOAT4)0); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 4) * wh) : (FLOAT4)0); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 5) * wh) : (FLOAT4)0); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 6) * wh) : (FLOAT4)0); - in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 7) * wh) : (FLOAT4)0); - - DOT16X16(in0, weights0, out2.s0); - DOT16X16(in1, weights1, out2.s0); - DOT16X16(in0, weights2, out2.s1); - DOT16X16(in1, weights3, out2.s1); - } - if(isValidBatch3){ - COMPUTE_FLOAT16 in0, in1; - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 1) * wh) : (FLOAT4)0); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 2) * wh) : (FLOAT4)0); - in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 3) * wh) : (FLOAT4)0); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 4) * wh) : (FLOAT4)0); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 5) * wh) : (FLOAT4)0); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 6) * wh) : (FLOAT4)0); - in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 7) * wh) : (FLOAT4)0); - - DOT16X16(in0, weights0, out3.s0); - DOT16X16(in1, weights1, out3.s0); - DOT16X16(in0, weights2, out3.s1); - DOT16X16(in1, weights3, out3.s1); - } - #endif - } - #endif - #endif //USE_LOW_BIT_WEIGHT_INT4 } #ifdef RELU @@ -1281,32 +1090,32 @@ __kernel void gemm_conv_c1_image(GLOBAL_SIZE_DIM2 bool isValidBatch3 = out_b_idx + 3 < batch; #endif -#if (defined USE_LOW_BIT_WEIGHT_INT8) const int loop = (blockDim + 15) / 16; #ifdef INPUT_CHANNEL_LEAVE const int loop_end = max(loop - 1, 0); #else const int loop_end = loop; #endif -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - const int loop = (blockDim + 31) / 32; - #ifdef INPUT_CHANNEL_LEAVE - const int loop_end = max(loop - 1, 0); - #else - const int loop_end = loop; - #endif -#endif for (int i = 0; i < blockNum; ++i){ int kindex = i * dstChannelC4 * 4 * 2; COMPUTE_FLOAT2 ScaleOffset = CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx, dequantScaleOffset + kindex)); - #if (defined USE_LOW_BIT_WEIGHT_INT8) for (int j = 0; j < loop_end; j++) { int k = i * loop + j; #ifndef WIDTH_HEIGHT_1 int k4 = k << 2; #endif + #if (defined USE_LOW_BIT_WEIGHT_INT8) COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; + #elif (defined USE_LOW_BIT_WEIGHT_INT4) + COMPUTE_FLOAT16 weights0; + { + uchar8 charWeightsInt4 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); + char16 charWeights = 0; + UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); + weights0 = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1; + } + #endif { COMPUTE_FLOAT16 in; #ifdef WIDTH_HEIGHT_1 @@ -1362,7 +1171,18 @@ __kernel void gemm_conv_c1_image(GLOBAL_SIZE_DIM2 { int k = i * loop + loop_end; int k4 = k << 2; + #if (defined USE_LOW_BIT_WEIGHT_INT8) COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; + #elif (defined USE_LOW_BIT_WEIGHT_INT4) + COMPUTE_FLOAT16 weights0; + { + uchar8 charWeightsInt4 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); + char16 charWeights = 0; + UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); + weights0 = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1; + } + #endif + PADZEROS(k, srcChannel, weights0); { COMPUTE_FLOAT16 in; in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k4 * wh)); @@ -1371,7 +1191,6 @@ __kernel void gemm_conv_c1_image(GLOBAL_SIZE_DIM2 in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 3) * wh) : (FLOAT4)0); DOT16X16(in, weights0, out); } - PADZEROS(k, srcChannel, weights0); #ifdef BACTH_BLOCK4 if(isValidBatch1){ COMPUTE_FLOAT16 in; @@ -1400,178 +1219,6 @@ __kernel void gemm_conv_c1_image(GLOBAL_SIZE_DIM2 #endif } #endif - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - for (int j = 0; j < loop_end; j++) { - int k = i * loop + j; - #ifndef WIDTH_HEIGHT_1 - int k8 = k << 3; - #endif - COMPUTE_FLOAT16 weights0, weights1; - { - uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; - } - { - COMPUTE_FLOAT16 in0, in1; - #ifdef WIDTH_HEIGHT_1 - in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * 32)); - in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * 32 + 16)); - #else - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 1) * wh)); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 2) * wh)); - in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 3) * wh)); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 4) * wh)); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 5) * wh)); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 6) * wh)); - in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k8 + 7) * wh)); - #endif - DOT16X16(in0, weights0, out); - DOT16X16(in1, weights1, out); - } - - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in0, in1; - #ifdef WIDTH_HEIGHT_1 - in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset1 + k * 32)); - in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset1 + k * 32 + 16)); - #else - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 1) * wh)); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 2) * wh)); - in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 3) * wh)); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 4) * wh)); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 5) * wh)); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 6) * wh)); - in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k8 + 7) * wh)); - #endif - DOT16X16(in0, weights0, out1); - DOT16X16(in1, weights1, out1); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in0, in1; - #ifdef WIDTH_HEIGHT_1 - in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset2 + k * 32)); - in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset2 + k * 32 + 16)); - #else - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 1) * wh)); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 2) * wh)); - in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 3) * wh)); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 4) * wh)); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 5) * wh)); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 6) * wh)); - in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k8 + 7) * wh)); - #endif - DOT16X16(in0, weights0, out2); - DOT16X16(in1, weights1, out2); - } - if(isValidBatch3){ - COMPUTE_FLOAT16 in0, in1; - #ifdef WIDTH_HEIGHT_1 - in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset3 + k * 32)); - in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset3 + k * 32 + 16)); - #else - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 1) * wh)); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 2) * wh)); - in0.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 3) * wh)); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 4) * wh)); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 5) * wh)); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 6) * wh)); - in1.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k8 + 7) * wh)); - #endif - DOT16X16(in0, weights0, out3); - DOT16X16(in1, weights1, out3); - } - #endif - } - #ifdef INPUT_CHANNEL_LEAVE - { - int k = i * loop + loop_end; - int k8 = k << 3; - COMPUTE_FLOAT16 weights0, weights1; - { - uchar16 charWeightsInt4 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; - } - PADZEROS(k, srcChannel, weights0); - PADZEROS(k + 15, srcChannel, weights1); - { - COMPUTE_FLOAT16 in0, in1; - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 1) * wh) : (FLOAT4)0); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 2) * wh) : (FLOAT4)0); - in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 3) * wh) : (FLOAT4)0); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 4) * wh) : (FLOAT4)0); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 5) * wh) : (FLOAT4)0); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 6) * wh) : (FLOAT4)0); - in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset + (k8 + 7) * wh) : (FLOAT4)0); - DOT16X16(in0, weights0, out); - DOT16X16(in1, weights1, out); - } - - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in0, in1; - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 1) * wh) : (FLOAT4)0); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 2) * wh) : (FLOAT4)0); - in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 3) * wh) : (FLOAT4)0); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 4) * wh) : (FLOAT4)0); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 5) * wh) : (FLOAT4)0); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 6) * wh) : (FLOAT4)0); - in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset1 + (k8 + 7) * wh) : (FLOAT4)0); - DOT16X16(in0, weights0, out1); - DOT16X16(in1, weights1, out1); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in0, in1; - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 1) * wh) : (FLOAT4)0); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 2) * wh) : (FLOAT4)0); - in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 3) * wh) : (FLOAT4)0); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 4) * wh) : (FLOAT4)0); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 5) * wh) : (FLOAT4)0); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 6) * wh) : (FLOAT4)0); - in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset2 + (k8 + 7) * wh) : (FLOAT4)0); - DOT16X16(in0, weights0, out2); - DOT16X16(in1, weights1, out2); - } - if(isValidBatch3){ - COMPUTE_FLOAT16 in0, in1; - in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k8 * wh)); - in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 1) * wh) : (FLOAT4)0); - in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 2) * wh) : (FLOAT4)0); - in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 3) * wh) : (FLOAT4)0); - - in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 4) * wh) : (FLOAT4)0); - in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 5) * wh) : (FLOAT4)0); - in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 6) * wh) : (FLOAT4)0); - in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + input_offset3 + (k8 + 7) * wh) : (FLOAT4)0); - DOT16X16(in0, weights0, out3); - DOT16X16(in1, weights1, out3); - } - #endif - } - #endif - #endif //USE_LOW_BIT_WEIGHT_INT4 } #ifdef RELU diff --git a/source/backend/opencl/execution/cl/groupnorm_buf.cl b/source/backend/opencl/execution/cl/groupnorm_buf.cl index 59e848dff..395ac23d1 100644 --- a/source/backend/opencl/execution/cl/groupnorm_buf.cl +++ b/source/backend/opencl/execution/cl/groupnorm_buf.cl @@ -65,8 +65,8 @@ __kernel void groupnorm_plain_buf(__private int global_dim0, __private int globa sum[lid] = sum[lid] + sum[lid + i]; barrier(CLK_LOCAL_MEM_FENCE); } - float4 square_sum = sum[0] / (float4)inside; - float4 value = (float4)1.0f / (float4)sqrt(square_sum + (float4)epsilon); + float4 square_sum = (float4)(sum[0] / inside); + float4 value = (float4)(1.0f / sqrt(square_sum.x + epsilon)); for(int i = lid; i < inside_v4; i+=LOCAL_SIZE){ float4 in0 = convert_float4(vload4(i, input0 + offset)); @@ -102,7 +102,6 @@ __kernel void groupnorm_plain_buf(__private int global_dim0, __private int globa barrier(CLK_LOCAL_MEM_FENCE); } - float mean = sum[0] / inside; in_sum = 0; @@ -173,8 +172,7 @@ __kernel void groupnorm_plain_buf(__private int global_dim0, __private int globa barrier(CLK_LOCAL_MEM_FENCE); } - float4 mean = sum[0] / (float4)inside; - + float4 mean = (float4)(sum[0] / inside); in_sum = 0; index = lid; for(; index < inside_v4 - 1; index+=LOCAL_SIZE){ @@ -203,8 +201,8 @@ __kernel void groupnorm_plain_buf(__private int global_dim0, __private int globa sum[lid] = sum[lid] + sum[lid + i]; barrier(CLK_LOCAL_MEM_FENCE); } - float4 square_sum = sum[0] / (float4)inside; - float4 value = (float4)1.0f / (float4)sqrt(square_sum + (float4)epsilon); + float4 square_sum = (float4)(sum[0] / inside); + float4 value = (float4)(1.0f / sqrt(square_sum.x + epsilon)); // The product of W and H is a multiple of 4 #ifdef WH_4 @@ -220,6 +218,7 @@ __kernel void groupnorm_plain_buf(__private int global_dim0, __private int globa #ifdef SWISH out = out * native_recip((float4)1+native_exp(convert_float4(-out))); #endif + vstore4(CONVERT_FLOAT4(out), i, output + offset); } #else @@ -235,6 +234,7 @@ __kernel void groupnorm_plain_buf(__private int global_dim0, __private int globa #ifdef SWISH out = out * native_recip(1.0+native_exp(-out)); #endif + output[offset+i] = (FLOAT)out; } #endif diff --git a/source/backend/opencl/execution/cl/matmul_params_buf.cl b/source/backend/opencl/execution/cl/matmul_params_buf.cl index ce1895d44..c4520fc8e 100644 --- a/source/backend/opencl/execution/cl/matmul_params_buf.cl +++ b/source/backend/opencl/execution/cl/matmul_params_buf.cl @@ -77,6 +77,26 @@ #define USE_CL_MAD 0 #endif +// BIAS_TYPE +// 0 -> without bias +// 1 -> with bias (add) [N] +// 2 -> with bias (eltwise_add) [M, N] +// 3 -> with bias (eltwise_sub) [M, N] +// 4 -> with bias (eltwise_sub and get negative) [M, N] +#ifndef BIAS_TYPE + #define BIAS_TYPE 0 +#endif + +#if BIAS_TYPE == 1 +#define DEAL_BIAS(x, a) x = x + a +#elif BIAS_TYPE == 2 +#define DEAL_BIAS(x, a) x = x + a +#elif BIAS_TYPE == 3 +#define DEAL_BIAS(x, a) x = x - a +#elif BIAS_TYPE == 4 +#define DEAL_BIAS(x, a) x = a - x +#endif + // By default the workgroup size requirement is enabled. For Qualcomm devices the workgroup size // requirement results in worse performance and is disabled (src/utilities/compile.cpp) #ifndef RELAX_WORKGROUP_SIZE @@ -313,7 +333,7 @@ INLINE_FUNC int GlobalIndexA() { } INLINE_FUNC realM GlobalToPrivateOptA(const __global realM* restrict agm, const int base, const int _mi, - const int kSizeM, const int idk) { + const int astride/*kSizeM*/, const int idk) { // Computes the indices based on strided/non-strided access #if STRM == 0 // [MWG/MWI, MWI/VWM, VWM] @@ -325,7 +345,7 @@ INLINE_FUNC realM GlobalToPrivateOptA(const __global realM* restrict agm, const // Loads the data from global memory (not transposed) and stores into registers // [kSizeK, kSizeM/VWM, VWM] - return agm[idk*(kSizeM/VWM) + idm]; + return agm[idk*(astride/VWM)+idm]; } INLINE_FUNC realM GlobalToPrivateA(const __global realM* restrict agm, const int _mi, @@ -366,7 +386,7 @@ INLINE_FUNC int GlobalIndexB() { } INLINE_FUNC realN GlobalToPrivateOptB(const __global realN* restrict bgm, const int base, const int _ni, - const int kSizeN, const int idk) { + const int bstride/*kSizeN*/, const int idk) { // Computes the indices based on strided/non-strided access #if STRN == 0 int idn = base + _ni; @@ -375,7 +395,7 @@ INLINE_FUNC realN GlobalToPrivateOptB(const __global realN* restrict bgm, const #endif // Loads the data from global memory (transposed) and stores into registers - return bgm[idk*(kSizeN/VWN) + idn]; + return bgm[idk*(bstride/VWN)+idn]; } INLINE_FUNC realN GlobalToPrivateB(const __global realN* restrict bgm, const int _ni, @@ -677,11 +697,15 @@ INLINE_FUNC INT2 StoreIndexN() { // layout : [M, N] INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, const INT2 baseOffset, - #ifdef BIAS - realN* epm, + #if BIAS_TYPE > 0 + #if BIAS_TYPE > 1 + __global realN* egm, + #else + realN* epm, + #endif #endif const int _mi, const int _ni, - const int kSizeN, const real alpha, const real beta) { + const int cstride/*kSizeN*/, const int dstride/*kSizeN*/, const real alpha, const real beta) { #if STRM == 0 int idm = _mi + baseOffset.index[0]; @@ -694,8 +718,8 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, int idn = baseOffset.index[1] + _ni*NDIMC; #endif - int index = idm * (kSizeN/VWN) + idn; - + int index = idm * (cstride/VWN) + idn; + realN result = c_value; // The final multiplication with alpha (in case beta == 0) @@ -784,11 +808,17 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, #endif -#ifdef BIAS - realN eval = epm[_ni]; - +#if BIAS_TYPE > 0 + #if BIAS_TYPE == 1 + realN eval = epm[_ni]; + #else + + int index_bias = idm * (dstride/VWN) + idn; + realN eval = egm[index_bias]; + #endif + #if VWN == 1 - result += eval; + DEAL_BIAS(result, eval); #ifdef RELU result = fmax(result, (FLOAT)0); #endif @@ -796,8 +826,8 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, result = clamp(result, (FLOAT)0, (FLOAT)6); #endif #elif VWN == 2 - result.x += eval.x; - result.y += eval.y; + DEAL_BIAS(result.x, eval.x); + DEAL_BIAS(result.y, eval.y); #ifdef RELU result = fmax(result, (FLOAT2)0); #endif @@ -805,10 +835,10 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, result = clamp(result, (FLOAT2)0, (FLOAT2)6); #endif #elif VWN == 4 - result.x += eval.x; - result.y += eval.y; - result.z += eval.z; - result.w += eval.w; + DEAL_BIAS(result.x, eval.x); + DEAL_BIAS(result.y, eval.y); + DEAL_BIAS(result.z, eval.z); + DEAL_BIAS(result.w, eval.w); #ifdef RELU result = fmax(result, (FLOAT4)0); #endif @@ -816,14 +846,14 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, result = clamp(result, (FLOAT4)0, (FLOAT4)6); #endif #elif VWN == 8 - result.s0 += eval.s0; - result.s1 += eval.s1; - result.s2 += eval.s2; - result.s3 += eval.s3; - result.s4 += eval.s4; - result.s5 += eval.s5; - result.s6 += eval.s6; - result.s7 += eval.s7; + DEAL_BIAS(result.s0, eval.s0); + DEAL_BIAS(result.s1, eval.s1); + DEAL_BIAS(result.s2, eval.s2); + DEAL_BIAS(result.s3, eval.s3); + DEAL_BIAS(result.s4, eval.s4); + DEAL_BIAS(result.s5, eval.s5); + DEAL_BIAS(result.s6, eval.s6); + DEAL_BIAS(result.s7, eval.s7); #ifdef RELU result = fmax(result, (FLOAT8)0); #endif @@ -831,22 +861,22 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, result = clamp(result, (FLOAT8)0, (FLOAT8)6); #endif #elif VWN == 16 - result.s0 += eval.s0; - result.s1 += eval.s1; - result.s2 += eval.s2; - result.s3 += eval.s3; - result.s4 += eval.s4; - result.s5 += eval.s5; - result.s6 += eval.s6; - result.s7 += eval.s7; - result.s8 += eval.s8; - result.s9 += eval.s9; - result.sA += eval.sA; - result.sB += eval.sB; - result.sC += eval.sC; - result.sD += eval.sD; - result.sE += eval.sE; - result.sF += eval.sF; + DEAL_BIAS(result.s0, eval.s0); + DEAL_BIAS(result.s1, eval.s1); + DEAL_BIAS(result.s2, eval.s2); + DEAL_BIAS(result.s3, eval.s3); + DEAL_BIAS(result.s4, eval.s4); + DEAL_BIAS(result.s5, eval.s5); + DEAL_BIAS(result.s6, eval.s6); + DEAL_BIAS(result.s7, eval.s7); + DEAL_BIAS(result.s8, eval.s8); + DEAL_BIAS(result.s9, eval.s9); + DEAL_BIAS(result.sA, eval.sA); + DEAL_BIAS(result.sB, eval.sB); + DEAL_BIAS(result.sC, eval.sC); + DEAL_BIAS(result.sD, eval.sD); + DEAL_BIAS(result.sE, eval.sE); + DEAL_BIAS(result.sF, eval.sF); #ifdef RELU result = fmax(result, (FLOAT16)0); #endif @@ -861,10 +891,10 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, // Main body of the matrix-multiplication algorithm. It calls various (inlined) functions. -INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, +INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, const int4 stride, const __global realM* restrict agm, const __global realN* restrict bgm, - #ifdef BIAS - const __global realN* restrict egm, + #if BIAS_TYPE > 0 + __global realN* restrict egm, #endif __global realM* cgm, const real alpha, const real beta #if SA == 1 && SB == 1 @@ -1076,12 +1106,12 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #pragma unroll for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { // Loads data: off-chip --> private (matrix B) - bpm[_ni] = GlobalToPrivateOptB(bgm, baseIndexB, _ni, kSizeN, idk); + bpm[_ni] = GlobalToPrivateOptB(bgm, baseIndexB, _ni, stride.s1/*kSizeN*/, idk); } #pragma unroll for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { - const realM aval = GlobalToPrivateOptA(agm, baseIndexA, _mi, kSizeM, idk); + const realM aval = GlobalToPrivateOptA(agm, baseIndexA, _mi, stride.s0/*kSizeM*/, idk); #pragma unroll for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { #if VWM == 1 @@ -1135,11 +1165,11 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #pragma unroll for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { // Loads data: off-chip --> private (matrix B) - apm[_mi] = GlobalToPrivateOptA(agm, baseIndexA, _mi, kSizeM, idk); + apm[_mi] = GlobalToPrivateOptA(agm, baseIndexA, _mi, stride.s0/*kSizeM*/, idk); } #pragma unroll for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { - const realN bval = GlobalToPrivateOptB(bgm, baseIndexB, _ni, kSizeN, idk); + const realN bval = GlobalToPrivateOptB(bgm, baseIndexB, _ni, stride.s1/*kSizeN*/, idk); #pragma unroll for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { @@ -1194,7 +1224,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #ifdef OUTPUTMN INT2 baseOffset = StoreIndexN(); - #ifdef BIAS + #if BIAS_TYPE == 1 #pragma promote_to_registers realN epm[NWI/VWN]; // MWI * 1 for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { @@ -1205,17 +1235,22 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #endif epm[_ni] = egm[idn]; } - #endif + #endif + + + #pragma unroll for (int _mi = 0; _mi < MWI; _mi += 1) { #pragma unroll for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { StoreResultsN((__global realN* )cgm, cpn[_mi * (NWI/VWN) + _ni], baseOffset, - #ifdef BIAS + #if BIAS_TYPE > 1 + (__global realN*)egm, + #elif BIAS_TYPE == 1 (realN*)epm, #endif - _mi, _ni, kSizeN, alpha, beta); + _mi, _ni, stride.s2, stride.s3, alpha, beta); } } @@ -1246,20 +1281,24 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_beta, const __global realM* restrict agm, // [K, M] const __global realN* restrict bgm, // [K, N] - #ifdef BIAS - const __global realN* restrict egm, // [N] + #if BIAS_TYPE > 0 + __global realN* restrict egm, // [N] #endif __global realM* cgm, - const int a_offset, const int b_offset, const int c_offset + __private const int4 offset, + __private const int4 stride ) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); // Adds the offsets (in case of use of a single temporary buffer for A, B, and C) - agm = (const __global realM*)((const __global real*)agm + a_offset); - bgm = (const __global realN*)((const __global real*)bgm + b_offset); - cgm = (__global realM*)((const __global real*)cgm + c_offset); + agm = (const __global realM*)((const __global real*)agm + offset.s0); + bgm = (const __global realN*)((const __global real*)bgm + offset.s1); + cgm = (__global realM*)((__global real*)cgm + offset.s2); + #if BIAS_TYPE > 0 + egm = (__global realN*)((__global real*)egm + offset.s3); + #endif // Allocates workgroup-private memory (local memory) #if SA == 1 __local realM alm[KWG * MWG/VWM]; @@ -1270,26 +1309,26 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, // Computes the matrix-multiplication and stores the result in global memory #if SA == 1 && SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, - #ifdef BIAS + XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm, bgm, + #if BIAS_TYPE > 0 egm, #endif cgm, alpha, beta, alm, blm); #elif SA == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, - #ifdef BIAS + XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm, bgm, + #if BIAS_TYPE > 0 egm, #endif cgm, alpha, beta, alm); #elif SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, - #ifdef BIAS + XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm, bgm, + #if BIAS_TYPE > 0 egm, #endif cgm, alpha, beta, blm); #else - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, - #ifdef BIAS + XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm, bgm, + #if BIAS_TYPE > 0 egm, #endif cgm, alpha, beta); @@ -1301,15 +1340,21 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, #else __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) #endif -void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, +void XgemmBatched(const int kSizeM, + const int kSizeN, + const int kSizeK, const real_arg arg_alpha, const real_arg arg_beta, - const __global realM* restrict agm, const int batch_offset_a, - const __global realN* restrict bgm, const int batch_offset_b, - #ifdef BIAS - const __global realN* restrict egm, const int batch_offset_e, + const __global realM* restrict agm, + const int batch_offset_a, + const __global realN* restrict bgm, + const int batch_offset_b, + #if BIAS_TYPE > 0 + __global realN* restrict egm, + const int batch_offset_e, #endif - __global realM* cgm, const int batch_offset_c) { + __global realM* cgm, + const int batch_offset_c) { const int batch = get_group_id(2); const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); @@ -1322,9 +1367,9 @@ void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, const __global realN* restrict bgm_ = &bgm[b_offset / VWN]; __global realM* restrict cgm_ = &cgm[c_offset / VWM]; - #ifdef BIAS + #if BIAS_TYPE > 0 const int e_offset = batch * batch_offset_e; - const __global realN* restrict egm_ = &egm[e_offset / VWN]; + __global realN* restrict egm_ = &egm[e_offset / VWN]; #endif // Allocates workgroup-private memory (local memory) @@ -1334,31 +1379,40 @@ void XgemmBatched(const int kSizeM, const int kSizeN, const int kSizeK, #if SB == 1 __local realN blm[KWG * NWG/VWN]; #endif - + int4 stride; + stride.s0 = kSizeM; + stride.s1 = kSizeN; + #ifdef OUTPUTMN + stride.s2 = kSizeN; + #else + stride.s2 = kSizeM; + #endif + stride.s3 = kSizeN; // Computes the matrix-multiplication and stores the result in global memory #if SA == 1 && SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, - #ifdef BIAS + XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm_, bgm_, + #if BIAS_TYPE > 0 egm_, #endif cgm_, alpha, beta, alm, blm); #elif SA == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, - #ifdef BIAS + XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm_, bgm_, + #if BIAS_TYPE > 0 egm_, #endif cgm_, alpha, beta, alm); #elif SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, - #ifdef BIAS + XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm_, bgm_, + #if BIAS_TYPE > 0 egm_, #endif cgm_, alpha, beta, blm); #else - XgemmBody(kSizeM, kSizeN, kSizeK, agm_, bgm_, - #ifdef BIAS + XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm_, bgm_, + #if BIAS_TYPE > 0 egm_, #endif cgm_, alpha, beta); #endif } + diff --git a/source/backend/opencl/execution/cl/opencl_program.cc b/source/backend/opencl/execution/cl/opencl_program.cc index f98f47417..7eebdd550 100644 --- a/source/backend/opencl/execution/cl/opencl_program.cc +++ b/source/backend/opencl/execution/cl/opencl_program.cc @@ -3092,32 +3092,36 @@ const char* gemv_conv1x1_buf = " int input_offset=((out_b_idx*srcChannelC4*height+out_h_idx)*width+out_w_idx)*4;\n" " int out_offset=(((out_b_idx*dstChannelC4+out_c_idx/4)*height+out_h_idx)*width+out_w_idx)*4+(out_c_idx % 4);\n" " int wh=width*height*4;\n" -"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n" " const int loop=(blockDim+15)/16;\n" " #ifdef INPUT_CHANNEL_LEAVE\n" " const int loop_end=max(loop-1,0);\n" " #else\n" " const int loop_end=loop;\n" " #endif\n" -"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n" -" const int loop=(blockDim+31)/32;\n" -" #ifdef INPUT_CHANNEL_LEAVE\n" -" const int loop_end=max(loop-1,0);\n" -" #else\n" -" const int loop_end=loop;\n" -" #endif\n" -"#endif\n" " for (int i=0; i= 4) {\n" -" int offset=buffer_offset;\n" -" output_values_int8.x=(char)(*(input_ptr+offset));\n" -" offset=mad24(1,ic_h_w_size,offset);\n" -" output_values_int8.y=(char)(*(input_ptr+offset));\n" -" offset += ic_h_w_size;\n" -" output_values_int8.z=(char)(*(input_ptr+offset));\n" -" offset += ic_h_w_size;\n" -" output_values_int8.w=(char)(*(input_ptr+offset));\n" -" } else if (remain_channel == 3) {\n" -" int offset=buffer_offset;\n" -" output_values_int8.x=(char)(*(input_ptr+offset));\n" -" offset=mad24(1,ic_h_w_size,offset);\n" -" output_values_int8.y=(char)(*(input_ptr+offset));\n" -" offset += ic_h_w_size;\n" -" output_values_int8.z=(char)(*(input_ptr+offset));\n" -" \n" -" } else if (remain_channel == 2) {\n" -" int offset=buffer_offset;\n" -" output_values_int8.x=(char)(*(input_ptr+offset));\n" -" offset=mad24(1,ic_h_w_size,offset);\n" -" output_values_int8.y=(char)(*(input_ptr+offset));\n" -" } else if (remain_channel == 1) {\n" -" int offset=buffer_offset;\n" -" output_values_int8.x=(char)(*(input_ptr+offset));\n" -" }\n" -" }\n" -" \n" +" const int buffer_offset=output_channel_4_idx*ic_h_w_size+input_channel_4_idx*height_width_size+buffer_height_idx*kernel_shape.y+buffer_width_idx;\n" +" int index0=buffer_offset,index1=buffer_offset+ic_h_w_size,index2=buffer_offset+2*ic_h_w_size,index3=buffer_offset+3*ic_h_w_size;\n" " uchar2 output_values_int4=(uchar2)(0,0);\n" -" output_values_int4.s0=(output_values_int8.x+8)*16+(output_values_int8.y+8);\n" -" output_values_int4.s1=(output_values_int8.z+8)*16+(output_values_int8.w+8);\n" -" \n" +" uchar s0=input_ptr[index0/2];\n" +" uchar s1=output_channel_4_idx+1 >= output_channel ? 0 : input_ptr[index1/2];\n" +" uchar s2=output_channel_4_idx+1 >= output_channel ? 0 : input_ptr[index2/2];\n" +" uchar s3=output_channel_4_idx+1 >= output_channel ? 0 : input_ptr[index3/2];\n" +" output_values_int4.x=((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f));\n" +" output_values_int4.y=((index2 % 2) == 0 ? (s2 & 0xf0) : (s2 << 4)) | ((index3 % 2) == 0 ? (s3 >> 4) : (s3 & 0x0f));\n" " const int out_offset=(image_width_idx*height_width_size*((output_channel+3)/4)+image_height_idx)*2;\n" " vstore2(output_values_int4,0,output+out_offset);\n" "}\n" @@ -11795,106 +11450,127 @@ const char* buffer_convert_quant = "#define CHAR16_TO_UCHAR8(a, b) "" a=(uchar8)(((b.s0+8) << 4)+b.s1+8,((b.s2+8) << 4)+b.s3+8,((b.s4+8) << 4)+b.s5+8,((b.s6+8) << 4)+b.s7+8,((b.s8+8) << 4)+b.s9+8,((b.sa+8) << 4)+b.sb+8,((b.sc+8) << 4)+b.sd+8,((b.se+8) << 4)+b.sf+8);\n" "#define CHAR32_TO_UCHAR16(a, b, c) "" a = (uchar16)(((b.s0 + 8) << 4) + b.s1 + 8, ((b.s2 + 8) << 4) + b.s3 + 8, ((b.s4 + 8) << 4) + b.s5 + 8, ((b.s6 + 8) << 4) + b.s7 + 8, ((b.s8 + 8) << 4) + b.s9 + 8, ((b.sa + 8) << 4) + b.sb + 8, ((b.sc + 8) << 4) + b.sd + 8, ((b.se + 8) << 4) + b.sf + 8, "" ((c.s0+8) << 4)+c.s1+8,((c.s2+8) << 4)+c.s3+8,((c.s4+8) << 4)+c.s5+8,((c.s6+8) << 4)+c.s7+8,((c.s8+8) << 4)+c.s9+8,((c.sa+8) << 4)+c.sb+8,((c.sc+8) << 4)+c.sd+8,((c.se+8) << 4)+c.sf+8);\n" "__kernel void conv2d_1x1_weight_quant_buffer(GLOBAL_SIZE_2_DIMS\n" -" __global const char *input_ptr,\n" "#ifdef USE_LOW_BIT_WEIGHT_INT4\n" -" __global uchar *output_ptr,\n" +" __global const uchar *input_ptr,\n" "#else\n" -" __global char *output_ptr,\n" +" __global const char *input_ptr,\n" "#endif\n" +" __global char *output_ptr,\n" " __private const int input_channel,\n" " __private const int output_channel) {\n" " int x=get_global_id(0); // ic/16\n" " int y=get_global_id(1); // oc\n" -" \n" " DEAL_NON_UNIFORM_DIM2(x,y);\n" " const int xin=x << 4;\n" " const int outputChannelC4=(output_channel+3) >> 2;\n" -" const int inputOffset=y*input_channel+xin;\n" -" char16 weight=0;\n" -"#ifdef INPUT_CHANNEL_LEAVE\n" -" if(xin+15 >= input_channel){\n" -" char *weight_ptr=(char*)&weight;\n" -" for(int i=0,j=0; xin+i> 4) : (s1 & 0x0f));\n" " }\n" "#else\n" -" weight=vload16(0,input_ptr+inputOffset);\n" +" const int inputOffset=(y*input_channel+xin)/2;\n" +" vstore8(convert_char8(vload8(0,input_ptr+inputOffset)),0,output_ptr+outputOffset);\n" "#endif\n" -" \n" -"#ifdef USE_LOW_BIT_WEIGHT_INT4\n" -" const int outputOffset=((x*outputChannelC4*4*8+y*8));\n" -" uchar8 outWeight;\n" -" CHAR16_TO_UCHAR8(outWeight,weight);\n" -" vstore8(outWeight,0,output_ptr+outputOffset);\n" "#else\n" +" const int inputOffset=y*input_channel+xin;\n" " const int outputOffset=(x*outputChannelC4*4+y) << 4;\n" -" vstore16(weight,0,output_ptr+outputOffset);\n" +" vstore16(convert_char16(vload16(0,input_ptr+inputOffset)),0,output_ptr+outputOffset);\n" "#endif\n" "}\n" "__kernel void conv2d_1x1_weight_quant_image(GLOBAL_SIZE_2_DIMS\n" -" __global const char *input_ptr,\n" +"#ifdef USE_LOW_BIT_WEIGHT_INT4\n" +" __global const uchar *input_ptr,\n" +"#else\n" +" __global const uchar *input_ptr,\n" +"#endif\n" " __write_only image2d_t output,\n" " __private const int input_channel,\n" " __private const int output_channel) {\n" -" \n" -"#ifdef USE_LOW_BIT_WEIGHT_INT4\n" -" int x=get_global_id(0); // ic/32\n" +" int x=get_global_id(0); // ic/16\n" " int y=get_global_id(1); // oc\n" -" \n" " DEAL_NON_UNIFORM_DIM2(x,y);\n" -" const int outputChannelC4=(output_channel+3) >> 2;\n" -" const int xin=x << 5;\n" -" const int inputOffset=y*input_channel+xin;\n" -" char16 weight00=0,weight01=0;\n" -"#ifdef INPUT_CHANNEL_LEAVE\n" -" if(xin+31 >= input_channel){\n" -" char *weight00_ptr=(char*)&weight00;\n" -" char *weight01_ptr=(char*)&weight01;\n" -" int i=0;\n" -" for(int j=0; xin+i> 4) : (s1 & 0x0f));\n" +" }\n" +" write_imageui(output,(int2)(y,x),convert_uint4(as_ushort4(out)));\n" "#else\n" -" weight00=vload16(0,input_ptr+inputOffset);\n" -" weight01=vload16(0,input_ptr+inputOffset+16);\n" +" const int inputOffset=(y*input_channel+xin)/2;\n" +" write_imageui(output,(int2)(y,x),convert_uint4(as_ushort4(vload8(0,input_ptr+inputOffset))));\n" "#endif\n" -" \n" -" uchar16 outWeight;\n" -" CHAR32_TO_UCHAR16(outWeight,weight00,weight01);\n" -" write_imagei(output,(int2)(y,x),as_int4(outWeight));\n" "#else\n" -" int x=get_global_id(0); // ic/16\n" -" int y=get_global_id(1); // oc\n" -" \n" -" DEAL_NON_UNIFORM_DIM2(x,y);\n" -" const int xin=x << 4;\n" " const int inputOffset=y*input_channel+xin;\n" -" const int outputChannelC4=(output_channel+3) >> 2;\n" -" char16 weight=0;\n" -"#ifdef INPUT_CHANNEL_LEAVE\n" -" if(xin+15 >= input_channel){\n" -" char *weight_ptr=(char*)&weight;\n" -" for(int i=0,j=0; xin+i> 4) : (s1 & 0x0f);\n" +" output_ptr[outputOffset+i*(ocPack/2)+j]=s0 | s1;\n" " }\n" -" }else {\n" -" weight=vload16(0,input_ptr+inputOffset);\n" " }\n" "#else\n" -" weight=vload16(0,input_ptr+inputOffset);\n" +" for(int i=0; i> 4);\n" +" char d1=((s0 & 0x0f) << 4) | (s1 & 0x0f);\n" +" output_ptr[outputOffset+(i*2)*(ocPack/2)+j]=d0;\n" +" output_ptr[outputOffset+(i*2+1)*(ocPack/2)+j]=d1;\n" +" }\n" +" }\n" "#endif\n" -" \n" -" write_imagei(output,(int2)(y,x),as_int4(weight));\n" +"#else\n" +" const int inputOffset=yin*input_channel+xin;\n" +" const int outputOffset=(x*outputChannelC4+y)*icPack*ocPack;\n" +" for(int i=0; i= 2\n" +" {\n" +" offsetA += strideA;\n" +" offsetB += strideB;\n" +" offsetC += strideC;\n" +" FLOAT8 in0=vload8(0,input0+offsetA);\n" +" FLOAT8 in1=vload8(0,input1+offsetB);\n" +" FLOAT8 out=OPERATOR;\n" +" vstore8(out,0,output+offsetC);\n" +" }\n" +" #endif\n" +" #if VEC_H == 4\n" +" {\n" +" offsetA += strideA;\n" +" offsetB += strideB;\n" +" offsetC += strideC;\n" +" FLOAT8 in0=vload8(0,input0+offsetA);\n" +" FLOAT8 in1=vload8(0,input1+offsetB);\n" +" FLOAT8 out=OPERATOR;\n" +" vstore8(out,0,output+offsetC);\n" +" }\n" +" {\n" +" offsetA += strideA;\n" +" offsetB += strideB;\n" +" offsetC += strideC;\n" +" FLOAT8 in0=vload8(0,input0+offsetA);\n" +" FLOAT8 in1=vload8(0,input1+offsetB);\n" +" FLOAT8 out=OPERATOR;\n" +" vstore8(out,0,output+offsetC);\n" +" }\n" +" #endif\n" +" }\n" +"}\n" +; +#endif +#ifndef MNN_OPENCL_BUFFER_CLOSED const char* matmul_params_buf = "#ifdef MNN_SUPPORT_FP16\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" @@ -19442,6 +19216,24 @@ const char* matmul_params_buf = "#ifndef USE_CL_MAD\n" " #define USE_CL_MAD 0\n" "#endif\n" +"// BIAS_TYPE\n" +"// 0 -> without bias\n" +"// 1 -> with bias (add) [N]\n" +"// 2 -> with bias (eltwise_add) [M,N]\n" +"// 3 -> with bias (eltwise_sub) [M,N]\n" +"// 4 -> with bias (eltwise_sub and get negative) [M,N]\n" +"#ifndef BIAS_TYPE\n" +" #define BIAS_TYPE 0\n" +"#endif\n" +"#if BIAS_TYPE == 1\n" +"#define DEAL_BIAS(x,a) x=x+a\n" +"#elif BIAS_TYPE == 2\n" +"#define DEAL_BIAS(x,a) x=x+a\n" +"#elif BIAS_TYPE == 3\n" +"#define DEAL_BIAS(x,a) x=x-a\n" +"#elif BIAS_TYPE == 4\n" +"#define DEAL_BIAS(x,a) x=a-x\n" +"#endif\n" "// By default the workgroup size requirement is enabled. For Qualcomm devices the workgroup size\n" "// requirement results in worse performance and is disabled (src/utilities/compile.cpp)\n" "#ifndef RELAX_WORKGROUP_SIZE\n" @@ -19652,7 +19444,7 @@ const char* matmul_params_buf = " return idm;\n" "}\n" "INLINE_FUNC realM GlobalToPrivateOptA(const __global realM* restrict agm,const int base,const int _mi,\n" -" const int kSizeM,const int idk) {\n" +" const int astride/*kSizeM*/,const int idk) {\n" " // Computes the indices based on strided/non-strided access\n" " #if STRM == 0\n" " // [MWG/MWI,MWI/VWM,VWM]\n" @@ -19663,7 +19455,7 @@ const char* matmul_params_buf = " #endif\n" " // Loads the data from global memory (not transposed) and stores into registers\n" " // [kSizeK,kSizeM/VWM,VWM]\n" -" return agm[idk*(kSizeM/VWM)+idm];\n" +" return agm[idk*(astride/VWM)+idm];\n" "}\n" "INLINE_FUNC realM GlobalToPrivateA(const __global realM* restrict agm,const int _mi,\n" " const int kSizeM,const int idk) {\n" @@ -19697,7 +19489,7 @@ const char* matmul_params_buf = " return idn;\n" "}\n" "INLINE_FUNC realN GlobalToPrivateOptB(const __global realN* restrict bgm,const int base,const int _ni,\n" -" const int kSizeN,const int idk) {\n" +" const int bstride/*kSizeN*/,const int idk) {\n" " // Computes the indices based on strided/non-strided access\n" " #if STRN == 0\n" " int idn=base+_ni;\n" @@ -19705,7 +19497,7 @@ const char* matmul_params_buf = " int idn=base+_ni*NDIMC;\n" " #endif\n" " // Loads the data from global memory (transposed) and stores into registers\n" -" return bgm[idk*(kSizeN/VWN)+idn];\n" +" return bgm[idk*(bstride/VWN)+idn];\n" "}\n" "INLINE_FUNC realN GlobalToPrivateB(const __global realN* restrict bgm,const int _ni,\n" " const int kSizeN,const int idk) {\n" @@ -19990,11 +19782,15 @@ const char* matmul_params_buf = "// layout : [M,N]\n" "INLINE_FUNC void StoreResultsN(__global realN* cgn,realN c_value,\n" " const INT2 baseOffset,\n" -" #ifdef BIAS\n" +" #if BIAS_TYPE>0\n" +" #if BIAS_TYPE>1\n" +" __global realN* egm,\n" +" #else\n" " realN* epm,\n" " #endif\n" +" #endif\n" " const int _mi,const int _ni,\n" -" const int kSizeN,const real alpha,const real beta) {\n" +" const int cstride/*kSizeN*/,const int dstride/*kSizeN*/,const real alpha,const real beta) {\n" " #if STRM == 0\n" " int idm=_mi+baseOffset.index[0];\n" " #elif STRM == 1\n" @@ -20005,7 +19801,8 @@ const char* matmul_params_buf = " #elif STRN == 1\n" " int idn=baseOffset.index[1]+_ni*NDIMC;\n" " #endif\n" -" int index=idm*(kSizeN/VWN)+idn;\n" +" int index=idm*(cstride/VWN)+idn;\n" +" \n" " realN result=c_value;\n" " \n" " // The final multiplication with alpha (in case beta == 0)\n" @@ -20093,10 +19890,17 @@ const char* matmul_params_buf = " #endif\n" " \n" " \n" -"#ifdef BIAS\n" +"#if BIAS_TYPE>0\n" +" #if BIAS_TYPE == 1\n" " realN eval=epm[_ni];\n" +" #else\n" +" \n" +" int index_bias=idm*(dstride/VWN)+idn;\n" +" realN eval=egm[index_bias];\n" +" #endif\n" +" \n" " #if VWN == 1\n" -" result += eval;\n" +" DEAL_BIAS(result,eval);\n" " #ifdef RELU\n" " result=fmax(result,(FLOAT)0);\n" " #endif\n" @@ -20104,8 +19908,8 @@ const char* matmul_params_buf = " result=clamp(result,(FLOAT)0,(FLOAT)6);\n" " #endif\n" " #elif VWN == 2\n" -" result.x += eval.x;\n" -" result.y += eval.y;\n" +" DEAL_BIAS(result.x,eval.x);\n" +" DEAL_BIAS(result.y,eval.y);\n" " #ifdef RELU\n" " result=fmax(result,(FLOAT2)0);\n" " #endif\n" @@ -20113,10 +19917,10 @@ const char* matmul_params_buf = " result=clamp(result,(FLOAT2)0,(FLOAT2)6);\n" " #endif\n" " #elif VWN == 4\n" -" result.x += eval.x;\n" -" result.y += eval.y;\n" -" result.z += eval.z;\n" -" result.w += eval.w;\n" +" DEAL_BIAS(result.x,eval.x);\n" +" DEAL_BIAS(result.y,eval.y);\n" +" DEAL_BIAS(result.z,eval.z);\n" +" DEAL_BIAS(result.w,eval.w);\n" " #ifdef RELU\n" " result=fmax(result,(FLOAT4)0);\n" " #endif\n" @@ -20124,14 +19928,14 @@ const char* matmul_params_buf = " result=clamp(result,(FLOAT4)0,(FLOAT4)6);\n" " #endif\n" " #elif VWN == 8\n" -" result.s0 += eval.s0;\n" -" result.s1 += eval.s1;\n" -" result.s2 += eval.s2;\n" -" result.s3 += eval.s3;\n" -" result.s4 += eval.s4;\n" -" result.s5 += eval.s5;\n" -" result.s6 += eval.s6;\n" -" result.s7 += eval.s7;\n" +" DEAL_BIAS(result.s0,eval.s0);\n" +" DEAL_BIAS(result.s1,eval.s1);\n" +" DEAL_BIAS(result.s2,eval.s2);\n" +" DEAL_BIAS(result.s3,eval.s3);\n" +" DEAL_BIAS(result.s4,eval.s4);\n" +" DEAL_BIAS(result.s5,eval.s5);\n" +" DEAL_BIAS(result.s6,eval.s6);\n" +" DEAL_BIAS(result.s7,eval.s7);\n" " #ifdef RELU\n" " result=fmax(result,(FLOAT8)0);\n" " #endif\n" @@ -20139,22 +19943,22 @@ const char* matmul_params_buf = " result=clamp(result,(FLOAT8)0,(FLOAT8)6);\n" " #endif\n" " #elif VWN == 16\n" -" result.s0 += eval.s0;\n" -" result.s1 += eval.s1;\n" -" result.s2 += eval.s2;\n" -" result.s3 += eval.s3;\n" -" result.s4 += eval.s4;\n" -" result.s5 += eval.s5;\n" -" result.s6 += eval.s6;\n" -" result.s7 += eval.s7;\n" -" result.s8 += eval.s8;\n" -" result.s9 += eval.s9;\n" -" result.sA += eval.sA;\n" -" result.sB += eval.sB;\n" -" result.sC += eval.sC;\n" -" result.sD += eval.sD;\n" -" result.sE += eval.sE;\n" -" result.sF += eval.sF;\n" +" DEAL_BIAS(result.s0,eval.s0);\n" +" DEAL_BIAS(result.s1,eval.s1);\n" +" DEAL_BIAS(result.s2,eval.s2);\n" +" DEAL_BIAS(result.s3,eval.s3);\n" +" DEAL_BIAS(result.s4,eval.s4);\n" +" DEAL_BIAS(result.s5,eval.s5);\n" +" DEAL_BIAS(result.s6,eval.s6);\n" +" DEAL_BIAS(result.s7,eval.s7);\n" +" DEAL_BIAS(result.s8,eval.s8);\n" +" DEAL_BIAS(result.s9,eval.s9);\n" +" DEAL_BIAS(result.sA,eval.sA);\n" +" DEAL_BIAS(result.sB,eval.sB);\n" +" DEAL_BIAS(result.sC,eval.sC);\n" +" DEAL_BIAS(result.sD,eval.sD);\n" +" DEAL_BIAS(result.sE,eval.sE);\n" +" DEAL_BIAS(result.sF,eval.sF);\n" " #ifdef RELU\n" " result=fmax(result,(FLOAT16)0);\n" " #endif\n" @@ -20166,10 +19970,10 @@ const char* matmul_params_buf = " cgn[index]=result;\n" "}\n" "// Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.\n" -"INLINE_FUNC void XgemmBody(const int kSizeM,const int kSizeN,const int kSizeK,\n" +"INLINE_FUNC void XgemmBody(const int kSizeM,const int kSizeN,const int kSizeK,const int4 stride,\n" " const __global realM* restrict agm,const __global realN* restrict bgm,\n" -" #ifdef BIAS\n" -" const __global realN* restrict egm,\n" +" #if BIAS_TYPE>0\n" +" __global realN* restrict egm,\n" " #endif\n" " __global realM* cgm,const real alpha,const real beta\n" " #if SA == 1 && SB == 1\n" @@ -20370,11 +20174,11 @@ const char* matmul_params_buf = " #pragma unroll\n" " for (int _ni=0; _ni private (matrix B)\n" -" bpm[_ni]=GlobalToPrivateOptB(bgm,baseIndexB,_ni,kSizeN,idk);\n" +" bpm[_ni]=GlobalToPrivateOptB(bgm,baseIndexB,_ni,stride.s1/*kSizeN*/,idk);\n" " }\n" " #pragma unroll\n" " for (int _mi=0; _mi private (matrix B)\n" -" apm[_mi]=GlobalToPrivateOptA(agm,baseIndexA,_mi,kSizeM,idk);\n" +" apm[_mi]=GlobalToPrivateOptA(agm,baseIndexA,_mi,stride.s0/*kSizeM*/,idk);\n" " }\n" " #pragma unroll\n" " for (int _ni=0; _ni1\n" +" (__global realN*)egm,\n" +" #elif BIAS_TYPE == 1\n" " (realN*)epm,\n" " #endif\n" -" _mi,_ni,kSizeN,alpha,beta);\n" +" _mi,_ni,stride.s2,stride.s3,alpha,beta);\n" " }\n" " }\n" " \n" @@ -20535,20 +20344,24 @@ const char* matmul_params_buf = " const real_arg arg_beta,\n" " const __global realM* restrict agm,// [K,M]\n" " const __global realN* restrict bgm,// [K,N]\n" -" #ifdef BIAS\n" -" const __global realN* restrict egm,// [N]\n" +" #if BIAS_TYPE>0\n" +" __global realN* restrict egm,// [N]\n" " #endif\n" " __global realM* cgm,\n" -" const int a_offset,const int b_offset,const int c_offset\n" +" __private const int4 offset,\n" +" __private const int4 stride\n" ") {\n" " const real alpha=GetRealArg(arg_alpha);\n" " const real beta=GetRealArg(arg_beta);\n" " \n" " // Adds the offsets (in case of use of a single temporary buffer for A,B,and C)\n" -" agm=(const __global realM*)((const __global real*)agm+a_offset);\n" -" bgm=(const __global realN*)((const __global real*)bgm+b_offset);\n" -" cgm=(__global realM*)((const __global real*)cgm+c_offset);\n" +" agm=(const __global realM*)((const __global real*)agm+offset.s0);\n" +" bgm=(const __global realN*)((const __global real*)bgm+offset.s1);\n" +" cgm=(__global realM*)((__global real*)cgm+offset.s2);\n" " \n" +" #if BIAS_TYPE>0\n" +" egm=(__global realN*)((__global real*)egm+offset.s3);\n" +" #endif\n" " // Allocates workgroup-private memory (local memory)\n" " #if SA == 1\n" " __local realM alm[KWG*MWG/VWM];\n" @@ -20559,26 +20372,26 @@ const char* matmul_params_buf = " \n" " // Computes the matrix-multiplication and stores the result in global memory\n" " #if SA == 1 && SB == 1\n" -" XgemmBody(kSizeM,kSizeN,kSizeK,agm,bgm,\n" -" #ifdef BIAS\n" +" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm,bgm,\n" +" #if BIAS_TYPE>0\n" " egm,\n" " #endif\n" " cgm,alpha,beta,alm,blm);\n" " #elif SA == 1\n" -" XgemmBody(kSizeM,kSizeN,kSizeK,agm,bgm,\n" -" #ifdef BIAS\n" +" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm,bgm,\n" +" #if BIAS_TYPE>0\n" " egm,\n" " #endif\n" " cgm,alpha,beta,alm);\n" " #elif SB == 1\n" -" XgemmBody(kSizeM,kSizeN,kSizeK,agm,bgm,\n" -" #ifdef BIAS\n" +" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm,bgm,\n" +" #if BIAS_TYPE>0\n" " egm,\n" " #endif\n" " cgm,alpha,beta,blm);\n" " #else\n" -" XgemmBody(kSizeM,kSizeN,kSizeK,agm,bgm,\n" -" #ifdef BIAS\n" +" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm,bgm,\n" +" #if BIAS_TYPE>0\n" " egm,\n" " #endif\n" " cgm,alpha,beta);\n" @@ -20589,15 +20402,21 @@ const char* matmul_params_buf = "#else\n" " __kernel __attribute__((reqd_work_group_size(MDIMC,NDIMC,1)))\n" "#endif\n" -"void XgemmBatched(const int kSizeM,const int kSizeN,const int kSizeK,\n" +"void XgemmBatched(const int kSizeM,\n" +" const int kSizeN,\n" +" const int kSizeK,\n" " const real_arg arg_alpha,\n" " const real_arg arg_beta,\n" -" const __global realM* restrict agm,const int batch_offset_a,\n" -" const __global realN* restrict bgm,const int batch_offset_b,\n" -" #ifdef BIAS\n" -" const __global realN* restrict egm,const int batch_offset_e,\n" +" const __global realM* restrict agm,\n" +" const int batch_offset_a,\n" +" const __global realN* restrict bgm,\n" +" const int batch_offset_b,\n" +" #if BIAS_TYPE>0\n" +" __global realN* restrict egm,\n" +" const int batch_offset_e,\n" " #endif\n" -" __global realM* cgm,const int batch_offset_c) {\n" +" __global realM* cgm,\n" +" const int batch_offset_c) {\n" " const int batch=get_group_id(2);\n" " const real alpha=GetRealArg(arg_alpha);\n" " const real beta=GetRealArg(arg_beta);\n" @@ -20610,9 +20429,9 @@ const char* matmul_params_buf = " const __global realN* restrict bgm_=&bgm[b_offset/VWN];\n" " __global realM* restrict cgm_=&cgm[c_offset/VWM];\n" " \n" -" #ifdef BIAS\n" +" #if BIAS_TYPE>0\n" " const int e_offset=batch*batch_offset_e;\n" -" const __global realN* restrict egm_=&egm[e_offset/VWN];\n" +" __global realN* restrict egm_=&egm[e_offset/VWN];\n" " #endif\n" " \n" " // Allocates workgroup-private memory (local memory)\n" @@ -20622,29 +20441,37 @@ const char* matmul_params_buf = " #if SB == 1\n" " __local realN blm[KWG*NWG/VWN];\n" " #endif\n" -" \n" +" int4 stride;\n" +" stride.s0=kSizeM;\n" +" stride.s1=kSizeN;\n" +" #ifdef OUTPUTMN\n" +" stride.s2=kSizeN;\n" +" #else\n" +" stride.s2=kSizeM;\n" +" #endif\n" +" stride.s3=kSizeN;\n" " // Computes the matrix-multiplication and stores the result in global memory\n" " #if SA == 1 && SB == 1\n" -" XgemmBody(kSizeM,kSizeN,kSizeK,agm_,bgm_,\n" -" #ifdef BIAS\n" +" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm_,bgm_,\n" +" #if BIAS_TYPE>0\n" " egm_,\n" " #endif\n" " cgm_,alpha,beta,alm,blm);\n" " #elif SA == 1\n" -" XgemmBody(kSizeM,kSizeN,kSizeK,agm_,bgm_,\n" -" #ifdef BIAS\n" +" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm_,bgm_,\n" +" #if BIAS_TYPE>0\n" " egm_,\n" " #endif\n" " cgm_,alpha,beta,alm);\n" " #elif SB == 1\n" -" XgemmBody(kSizeM,kSizeN,kSizeK,agm_,bgm_,\n" -" #ifdef BIAS\n" +" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm_,bgm_,\n" +" #if BIAS_TYPE>0\n" " egm_,\n" " #endif\n" " cgm_,alpha,beta,blm);\n" " #else\n" -" XgemmBody(kSizeM,kSizeN,kSizeK,agm_,bgm_,\n" -" #ifdef BIAS\n" +" XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm_,bgm_,\n" +" #if BIAS_TYPE>0\n" " egm_,\n" " #endif\n" " cgm_,alpha,beta);\n" diff --git a/source/backend/opencl/execution/cl/opencl_source_map.hpp b/source/backend/opencl/execution/cl/opencl_source_map.hpp index 8092ae61e..6ec7a2399 100644 --- a/source/backend/opencl/execution/cl/opencl_source_map.hpp +++ b/source/backend/opencl/execution/cl/opencl_source_map.hpp @@ -150,6 +150,9 @@ extern const char* input_transe_buf; extern const char* reduction_buf; #endif #ifndef MNN_OPENCL_BUFFER_CLOSED +extern const char* strassen_binary_buf; +#endif +#ifndef MNN_OPENCL_BUFFER_CLOSED extern const char* matmul_params_buf; #endif extern const char* cast; @@ -317,6 +320,9 @@ const std::map OpenCLProgramMap = #ifndef MNN_OPENCL_BUFFER_CLOSED { "reduction_buf", reduction_buf }, #endif +#ifndef MNN_OPENCL_BUFFER_CLOSED + { "strassen_binary_buf", strassen_binary_buf }, +#endif #ifndef MNN_OPENCL_BUFFER_CLOSED { "matmul_params_buf", matmul_params_buf }, #endif diff --git a/source/backend/opencl/execution/cl/strassen_binary_buf.cl b/source/backend/opencl/execution/cl/strassen_binary_buf.cl new file mode 100644 index 000000000..76894d266 --- /dev/null +++ b/source/backend/opencl/execution/cl/strassen_binary_buf.cl @@ -0,0 +1,101 @@ +#ifdef MNN_SUPPORT_FP16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + +__kernel void binary_cfunction_buf(__private int global_dim0, __private int global_dim1, + __global FLOAT* input0, + __private const int offsetC, + __private const int strideC, + __global FLOAT* input1, __global FLOAT* output, + __private const int width,//[offsetA, offsetB, offsetC, 0] + __private const int height//[strideA, strideB, strideC, 0] +) { + int2 pos = (int2)(get_global_id(0), get_global_id(1));// [X/16, Y] + + if (pos.x < global_dim0 && pos.y < global_dim1) { + int offset_11 = offsetC + pos.x * 8 + pos.y * strideC; + int offset_12 = offset_11 + width; + int offset_21 = offset_11 + strideC * height; + int offset_22 = offset_21 + width; + + FLOAT8 in_11 = vload8(0, input0 + offset_11); + FLOAT8 in_12 = vload8(0, input0 + offset_12); + FLOAT8 in_21 = vload8(0, input0 + offset_21); + FLOAT8 in_22 = vload8(0, input0 + offset_22); + FLOAT8 in_cx = vload8(0, input1 + pos.x * 8 + pos.y * width); + + in_12 = in_12 + in_cx; + in_21 = in_12 + in_21; + in_12 = in_22 + in_12; + in_22 = in_22 + in_21; + in_12 = in_11 + in_12; + + vstore8(in_21, 0, output + offset_21); + vstore8(in_22, 0, output + offset_22); + vstore8(in_12, 0, output + offset_12); + } +} + +#ifndef OPERATOR +#define OPERATOR in0+in1 +#endif + +__kernel void binary_function_buf(__private int global_dim0, __private int global_dim1, + __global FLOAT* input0, __global FLOAT* input1, __global FLOAT* output, + __private const int4 baseOffsets,//[offsetA, offsetB, offsetC, 0] + __private const int4 strides//[strideA, strideB, strideC, 0] +) { + int2 pos = (int2)(get_global_id(0), get_global_id(1));// [X/16, Y] + + if (pos.x < global_dim0 && pos.y < global_dim1) { + const int baseOffsetA = baseOffsets.x; + const int baseOffsetB = baseOffsets.y; + const int baseOffsetC = baseOffsets.z; + const int strideA = strides.x; + const int strideB = strides.y; + const int strideC = strides.z; + + + int offsetA = pos.x * 8 + pos.y * VEC_H * strideA + baseOffsetA; + int offsetB = pos.x * 8 + pos.y * VEC_H * strideB + baseOffsetB; + int offsetC = pos.x * 8 + pos.y * VEC_H * strideC + baseOffsetC; + + { + FLOAT8 in0 = vload8(0, input0 + offsetA); + FLOAT8 in1 = vload8(0, input1 + offsetB); + FLOAT8 out = OPERATOR; + vstore8(out, 0, output + offsetC); + } + #if VEC_H >= 2 + { + offsetA += strideA; + offsetB += strideB; + offsetC += strideC; + FLOAT8 in0 = vload8(0, input0 + offsetA); + FLOAT8 in1 = vload8(0, input1 + offsetB); + FLOAT8 out = OPERATOR; + vstore8(out, 0, output + offsetC); + } + #endif + #if VEC_H == 4 + { + offsetA += strideA; + offsetB += strideB; + offsetC += strideC; + FLOAT8 in0 = vload8(0, input0 + offsetA); + FLOAT8 in1 = vload8(0, input1 + offsetB); + FLOAT8 out = OPERATOR; + vstore8(out, 0, output + offsetC); + } + { + offsetA += strideA; + offsetB += strideB; + offsetC += strideC; + FLOAT8 in0 = vload8(0, input0 + offsetA); + FLOAT8 in1 = vload8(0, input1 + offsetB); + FLOAT8 out = OPERATOR; + vstore8(out, 0, output + offsetC); + } + #endif + } +} diff --git a/source/backend/opencl/execution/image/ConvExecution.cpp b/source/backend/opencl/execution/image/ConvExecution.cpp index f83de1223..d5315ffee 100644 --- a/source/backend/opencl/execution/image/ConvExecution.cpp +++ b/source/backend/opencl/execution/image/ConvExecution.cpp @@ -93,7 +93,7 @@ ConvExecution::ConvExecution(const std::vector &inputs, const std::vec std::shared_ptr quanCommon; if (nullptr != conv2dParams->quanParameter()) { - quanCommon = ConvolutionCommon::load(conv2dParams, backend, true); + quanCommon = ConvolutionCommon::load(op, backend, true); if (nullptr == quanCommon) { MNN_ERROR("Memory not Enough, can't extract IDST Convolution: %s \n", op->name()->c_str()); } diff --git a/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp b/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp index 830e6737d..f40f3d644 100644 --- a/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp +++ b/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp @@ -12,7 +12,7 @@ namespace OpenCL { // set mDequantScale mDequantOffset mNumQuantBit mFilterDataPtr from mConv2dParams void ConvLowMemoryExecution::getInfoFromOpLowMemory(std::shared_ptr & quanCommon) { - quanCommon = ConvolutionCommon::load(mResource->mConv2dParams, this->backend(), false, true); + quanCommon = ConvolutionCommon::load(mOp, this->backend(), false, true); if (mResource->mConv2dParams->quanParameter() != nullptr) { mLowMemoryFlag = true; } else { @@ -24,6 +24,7 @@ void ConvLowMemoryExecution::getInfoFromOpLowMemory(std::shared_ptrcanUseInt4){ mNumQuantBit = 4; + mResource->mInputChannel = (quanCommon->weight.size() * 2) / (mResource->mKernelWidth * mResource->mKernelHeight * mResource->mOutputChannel); }else{ mNumQuantBit = 8; } @@ -71,58 +72,100 @@ void ConvLowMemoryExecution::getInfoFromOpLowMemory(std::shared_ptrweight.get(); } + +bool ConvLowMemoryExecution::convertToQuantWeight1x1Buffer(cl::Buffer input, int icPack, int ocPack) { +#ifdef LOG_VERBOSE + MNN_PRINT("start convertToQuantWeight1x1Buffer !\n"); +#endif + auto runtime = mOpenCLBackend->getOpenCLRuntime(); + std::string kernelName = "conv2d_1x1_ic_oc_weight_quant_buffer"; + std::set buildOptions; + if (mNumQuantBit == 8) { + buildOptions.emplace("-DUSE_LOW_BIT_WEIGHT_INT8"); + } else if (mNumQuantBit == 4){ + // int4 case + buildOptions.emplace("-DUSE_LOW_BIT_WEIGHT_INT4"); + } else {/* More types to be supported. */} + if(mResource->mInputChannel % icPack != 0){ + buildOptions.emplace("-DCHANNEL_LEAVE"); + } + + mBufferToConv1x1Kernel = runtime->buildKernelWithCache("buffer_convert_quant", kernelName, buildOptions); + auto kernel = mBufferToConv1x1Kernel->get(); + uint32_t gws[2] = {static_cast(UP_DIV(mResource->mInputChannel, icPack)), static_cast(UP_DIV(mResource->mOutputChannel, ocPack))}; + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= kernel.setArg(idx++, gws[0]); + ret |= kernel.setArg(idx++, gws[1]); + ret |= kernel.setArg(idx++, input); + ret |= kernel.setArg(idx++, *mResource->mKernelBuffer.get()); + ret |= kernel.setArg(idx++, mResource->mInputChannel); + ret |= kernel.setArg(idx++, mResource->mOutputChannel); + ret |= kernel.setArg(idx++, icPack); + ret |= kernel.setArg(idx++, ocPack); + MNN_CHECK_CL_SUCCESS(ret, "setArg convertToQuantWeight1x1Buffer"); + + const uint32_t maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mBufferToConv1x1Kernel)); + const std::vector lws = {16, std::max((uint32_t)1, maxWorkGroupSize / 16)}; + + cl::Event event; + cl_int res; + + std::vector roundUpGroupWorkSize(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundUpGroupWorkSize[i] = ROUND_UP(gws[i], lws[i]); + } + + res = runtime->commandQueue().enqueueNDRangeKernel(kernel, cl::NullRange, + cl::NDRange(roundUpGroupWorkSize[0], roundUpGroupWorkSize[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + + event.wait(); + MNN_CHECK_CL_SUCCESS(res, "convertToQuantWeight1x1Buffer"); + +#ifdef LOG_VERBOSE + MNN_PRINT("end convertToQuantWeight1x1Buffer !\n"); +#endif + return true; +} + // set mKernelBuffer for the 1x1 kernels void ConvLowMemoryExecution::set1x1WeightLowMemory(int packCout, int packCin, void * filterDataPtr, std::shared_ptr & quanCommon) { cl_int res; - std::shared_ptr filterBuffer(Tensor::createDevice({ROUND_UP(mResource->mOutputChannel, 8)/*Cout pack set to max 8*/, ROUND_UP(mResource->mInputChannel, packCin), mResource->mKernelWidth, mResource->mKernelHeight})); + std::shared_ptr filterBuffer(Tensor::createDevice({ROUND_UP(mResource->mOutputChannel, packCout)/*Cout pack set to max 8*/, ROUND_UP(mResource->mInputChannel, packCin), 1, 1})); size_t buffer_size = filterBuffer->usize() / sizeof(float); + size_t cpy_size = mResource->mOutputChannel * mResource->mInputChannel; float *dequantAlpha = quanCommon->alpha.get(); // shared part for all cases - if (mNumQuantBit == 8) { - // int8 case - buffer_size *= sizeof(int8_t); - } else if (mNumQuantBit == 4){ + if (mNumQuantBit == 4){ // int4 case buffer_size /= 2; + cpy_size = UP_DIV(cpy_size, 2); } else {/* More types to be supported. */} - mResource->mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size)); - auto kernelBufferPtr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*(mResource->mKernelBuffer.get()), true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res); - if(kernelBufferPtr != nullptr && res == CL_SUCCESS){ - ::memset(kernelBufferPtr, 0, buffer_size); - - - for(int o = 0; o < mResource->mOutputChannel; o++){ - int i = 0; - for(; i < mResource->mInputChannel; i++){ - int bufferIdx = (o/packCout) * packCin*packCout + (i/packCin)*packCin*ROUND_UP(mResource->mOutputChannel, packCout) + (i%packCin)*packCout + (o%packCout);//(Ci/packCin, Co/packCout, packCin, packCout) - int filterIdx = o*mResource->mInputChannel + i; - if (mNumQuantBit == 8) { - // int8 case - ((int8_t *)kernelBufferPtr)[bufferIdx] = (int8_t)(((int8_t *)filterDataPtr)[filterIdx]); - } else if (mNumQuantBit == 4){ - // int4 case - if (bufferIdx % 2 == 0) { - ((uint8_t *)kernelBufferPtr)[bufferIdx / 2] += (uint8_t)((((int8_t *)filterDataPtr)[filterIdx] + 8) * 16); - } else { - ((uint8_t *)kernelBufferPtr)[bufferIdx / 2] += (uint8_t)(((int8_t *)filterDataPtr)[filterIdx] + 8); - } - } else {/* More types to be supported. */} - } - } + cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size); + void *mapPtr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(filterBufferCL, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res); + if(mapPtr != nullptr && res == CL_SUCCESS){ + ::memcpy(mapPtr, filterDataPtr, cpy_size); } else { MNN_ERROR("set1x1WeightLowMemory: Map error ptrCL == nullptr \n"); MNN_ASSERT(false); } - mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*(mResource->mKernelBuffer.get()), kernelBufferPtr); + mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(filterBufferCL, mapPtr); + + mResource->mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size)); + convertToQuantWeight1x1Buffer(filterBufferCL, packCin, packCout); } // set mFilter for the general kernels void ConvLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, std::shared_ptr & quanCommon) { if (filterDataPtr != nullptr) { - std::vector filterImageShape{ROUND_UP(mResource->mInputChannel, 4), (UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight)}; - std::shared_ptr filterBuffer(Tensor::createDevice({mResource->mOutputChannel, ROUND_UP(mResource->mInputChannel, 4), mResource->mKernelWidth, mResource->mKernelHeight})); - // int buffer_size = filterBuffer->elementSize(); + std::shared_ptr filterBuffer(Tensor::createDevice({ROUND_UP(mResource->mOutputChannel, 4), mResource->mInputChannel, mResource->mKernelWidth, mResource->mKernelHeight})); size_t buffer_size = filterBuffer->usize() / sizeof(float); - buffer_size *= sizeof(int8_t); + size_t cpy_size = mResource->mOutputChannel * mResource->mInputChannel * mResource->mKernelWidth * mResource->mKernelHeight; + if (mNumQuantBit == 4){ + buffer_size /= 2; + cpy_size = UP_DIV(cpy_size, 2); + } cl::Buffer filterBufferCL(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size); filterBuffer->buffer().device = (uint64_t)(&filterBufferCL); float *dequantAlpha = quanCommon->alpha.get(); @@ -130,14 +173,7 @@ void ConvLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, std: cl_int res; auto ptrCL = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(filterBufferCL, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res); if(ptrCL != nullptr && res == CL_SUCCESS) { - ::memset(ptrCL, 0, buffer_size); - const int copy_size = mResource->mKernelWidth * mResource->mKernelHeight * sizeof(int8_t); - for(int oc=0; ocmOutputChannel; oc++) { - int ic = 0; - for(; icmInputChannel; ic++) { - ::memcpy((int8_t *)ptrCL + (oc * ROUND_UP(mResource->mInputChannel, 4) + ic) * mResource->mKernelWidth * mResource->mKernelHeight, ((int8_t *)filterDataPtr) + (oc * mResource->mInputChannel + ic) * mResource->mKernelWidth * mResource->mKernelHeight, copy_size); - } - } + ::memcpy(ptrCL, filterDataPtr, cpy_size); } else { MNN_ERROR("setGeneralWeightLowMemory: Map error ptrCL == nullptr \n"); } @@ -145,7 +181,7 @@ void ConvLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, std: // convert to NC4HW4 if (mNumQuantBit == 8) { // ROUND_UP(IC, 4), UP_DIV(OC, 4) * mKernelWidth * mKernelHeight - mResource->mFilter.reset(Tensor::createDevice({1, filterImageShape[1], 1, 4 * filterImageShape[0]})); + mResource->mFilter.reset(Tensor::createDevice({1, UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight, 1, 4 * ROUND_UP(mResource->mInputChannel, 4)})); mResource->mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size)); mResource->mFilter->buffer().device = (uint64_t)(mResource->mKernelBuffer.get()); MNN::OpenCL::BufferConvertor bufferConvertor{mOpenCLBackend->getOpenCLRuntime()}; @@ -156,8 +192,8 @@ void ConvLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, std: // For int4 case, data stored in mFilter should be uint8_t // while "Tensor::createDevice" occupies more memory than "Tensor::createDevice". // Therefore, we use "Tensor::createDevice" currently, leaving "Tensor::createDevice" to be supported. - mResource->mFilter.reset(Tensor::createDevice({1, filterImageShape[1], 1, 2 * filterImageShape[0]})); - mResource->mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size/2)); + mResource->mFilter.reset(Tensor::createDevice({1, UP_DIV(mResource->mOutputChannel, 4) * mResource->mKernelWidth * mResource->mKernelHeight, 1, 2 * ROUND_UP(mResource->mInputChannel, 4)})); + mResource->mKernelBuffer.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size)); mResource->mFilter->buffer().device = (uint64_t)(mResource->mKernelBuffer.get()); MNN::OpenCL::BufferConvertor bufferConvertor{mOpenCLBackend->getOpenCLRuntime()}; // filterBuffer shape: {OC, ROUND_UP(IC, 4), mKernelWidth, mKernelHeight} diff --git a/source/backend/opencl/execution/image/ConvLowMemoryExecution.hpp b/source/backend/opencl/execution/image/ConvLowMemoryExecution.hpp index af6d7d897..b7e22c41f 100644 --- a/source/backend/opencl/execution/image/ConvLowMemoryExecution.hpp +++ b/source/backend/opencl/execution/image/ConvLowMemoryExecution.hpp @@ -30,6 +30,7 @@ class ConvLowMemoryExecution : public ConvCommonExecution, public CommonExecutio void tune1x1CaseLowMemory(Tensor * input, Tensor * output); void tuneGeneralCaseLowMemory(Tensor * input, Tensor * output); void tuneGemmLowMemory(Tensor * input, Tensor * output); + bool convertToQuantWeight1x1Buffer(cl::Buffer input, int icPack, int ocPack); std::vector mPaddings{0, 0}; std::vector mGlobalWorkSize{1, 1, 1}; std::vector mLocalWorkSize{1, 1, 1, 1}; @@ -37,6 +38,7 @@ class ConvLowMemoryExecution : public ConvCommonExecution, public CommonExecutio void *mFilterDataPtr = nullptr; bool mLowMemoryFlag = false; int mNumQuantBit = 0; + std::shared_ptr mBufferToConv1x1Kernel = nullptr; }; } // namespace OpenCL diff --git a/source/backend/opencl/execution/image/ConvWinograd.cpp b/source/backend/opencl/execution/image/ConvWinograd.cpp index 25f793286..b6b7e2442 100644 --- a/source/backend/opencl/execution/image/ConvWinograd.cpp +++ b/source/backend/opencl/execution/image/ConvWinograd.cpp @@ -68,7 +68,7 @@ ConvWinograd::ConvWinograd(const MNN::Op *op, Backend* backend) : CommonExecutio std::shared_ptr quanCommon; if (nullptr != conv2D->quanParameter()) { - quanCommon = ConvolutionCommon::load(conv2D, backend, true); + quanCommon = ConvolutionCommon::load(op, backend, true); if (nullptr == quanCommon) { MNN_ERROR("Memory not Enough, can't extract IDST Convolution \n"); } diff --git a/source/backend/opencl/execution/image/DeconvExecution.cpp b/source/backend/opencl/execution/image/DeconvExecution.cpp index b16abb3ec..d9ee162b1 100644 --- a/source/backend/opencl/execution/image/DeconvExecution.cpp +++ b/source/backend/opencl/execution/image/DeconvExecution.cpp @@ -28,7 +28,7 @@ DeconvExecution::DeconvExecution(const std::vector &inputs, const MNN: const float* filterDataPtr = nullptr; int weightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend, conv2dParams, &filterDataPtr, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &weightSize); int inputChannel = weightSize / (kernelWidth * kernelHeight * outputChannel); std::vector filterShape{outputChannel, inputChannel, kernelHeight, kernelWidth}; diff --git a/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp b/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp index dbdbd1bbd..d81a67d78 100644 --- a/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp +++ b/source/backend/opencl/execution/image/DepthwiseConvExecution.cpp @@ -37,7 +37,7 @@ DepthwiseConvExecution::DepthwiseConvExecution(const std::vector &inpu const float* filterDataPtr = nullptr; int filterDataSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend, mResource->mConv2dParams, &filterDataPtr, &filterDataSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &filterDataSize); mResource->mFilter.reset(Tensor::createDevice({1, filterImageShape[1], 1, 4 * filterImageShape[0]})); std::shared_ptr filterBuffer(Tensor::createDevice(filterShape)); diff --git a/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp b/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp index da85385a7..0ddaf7f0a 100644 --- a/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp +++ b/source/backend/opencl/execution/image/DepthwiseDeconvExecution.cpp @@ -33,7 +33,7 @@ DepthwiseDeconvExecution::DepthwiseDeconvExecution(const std::vector & const float* filterDataPtr = nullptr; int tempWeightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend, mResource->mConv2dParams, &filterDataPtr, &tempWeightSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend, op, &filterDataPtr, &tempWeightSize); mResource->mFilter.reset(Tensor::createDevice({1, filterImageShape[1], 1, 4 * filterImageShape[0]})); std::shared_ptr filterBuffer(Tensor::createDevice(filterShape)); diff --git a/source/backend/opencl/schema/CLCache.fbs b/source/backend/opencl/schema/CLCache.fbs index d53ce263f..958bbbb0c 100644 --- a/source/backend/opencl/schema/CLCache.fbs +++ b/source/backend/opencl/schema/CLCache.fbs @@ -29,11 +29,17 @@ table GemmInfo { paramInfo:[uint]; } +table PreParamInfo{ + preParamName:string; + preParamData:uint; +} + table Cache { programs:[Shader]; tunings:[Autotuning]; tuned:[OpInfo]; gemm:[GemmInfo]; + preParam:[PreParamInfo]; } root_type Cache; diff --git a/source/backend/opencl/schema/current/CLCache_generated.h b/source/backend/opencl/schema/current/CLCache_generated.h index 5918d6049..1fbe47226 100644 --- a/source/backend/opencl/schema/current/CLCache_generated.h +++ b/source/backend/opencl/schema/current/CLCache_generated.h @@ -23,6 +23,9 @@ struct AutotuningT; struct GemmInfo; struct GemmInfoT; +struct PreParamInfo; +struct PreParamInfoT; + struct Cache; struct CacheT; @@ -36,6 +39,8 @@ inline const flatbuffers::TypeTable *AutotuningTypeTable(); inline const flatbuffers::TypeTable *GemmInfoTypeTable(); +inline const flatbuffers::TypeTable *PreParamInfoTypeTable(); + inline const flatbuffers::TypeTable *CacheTypeTable(); struct TensorInfoT : public flatbuffers::NativeTable { @@ -420,12 +425,78 @@ inline flatbuffers::Offset CreateGemmInfo( flatbuffers::Offset CreateGemmInfo(flatbuffers::FlatBufferBuilder &_fbb, const GemmInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct PreParamInfoT : public flatbuffers::NativeTable { + typedef PreParamInfo TableType; + std::string preParamName; + uint32_t preParamData; + PreParamInfoT() + : preParamData(0) { + } +}; + +struct PreParamInfo FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef PreParamInfoT NativeTableType; + static const flatbuffers::TypeTable *MiniReflectTypeTable() { + return PreParamInfoTypeTable(); + } + const flatbuffers::String *preParamName() const { + return GetPointer(4); + } + uint32_t preParamData() const { + return GetField(6, 0); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, 4) && + verifier.VerifyString(preParamName()) && + VerifyField(verifier, 6) && + verifier.EndTable(); + } + PreParamInfoT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(PreParamInfoT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const PreParamInfoT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct PreParamInfoBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_preParamName(flatbuffers::Offset preParamName) { + fbb_.AddOffset(4, preParamName); + } + void add_preParamData(uint32_t preParamData) { + fbb_.AddElement(6, preParamData, 0); + } + explicit PreParamInfoBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + PreParamInfoBuilder &operator=(const PreParamInfoBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreatePreParamInfo( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset preParamName = 0, + uint32_t preParamData = 0) { + PreParamInfoBuilder builder_(_fbb); + builder_.add_preParamData(preParamData); + builder_.add_preParamName(preParamName); + return builder_.Finish(); +} + +flatbuffers::Offset CreatePreParamInfo(flatbuffers::FlatBufferBuilder &_fbb, const PreParamInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct CacheT : public flatbuffers::NativeTable { typedef Cache TableType; std::vector> programs; std::vector> tunings; std::vector> tuned; std::vector> gemm; + std::vector> preParam; CacheT() { } }; @@ -447,6 +518,9 @@ struct Cache FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const flatbuffers::Vector> *gemm() const { return GetPointer> *>(10); } + const flatbuffers::Vector> *preParam() const { + return GetPointer> *>(12); + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, 4) && @@ -461,6 +535,9 @@ struct Cache FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, 10) && verifier.VerifyVector(gemm()) && verifier.VerifyVectorOfTables(gemm()) && + VerifyOffset(verifier, 12) && + verifier.VerifyVector(preParam()) && + verifier.VerifyVectorOfTables(preParam()) && verifier.EndTable(); } CacheT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -483,6 +560,9 @@ struct CacheBuilder { void add_gemm(flatbuffers::Offset>> gemm) { fbb_.AddOffset(10, gemm); } + void add_preParam(flatbuffers::Offset>> preParam) { + fbb_.AddOffset(12, preParam); + } explicit CacheBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -500,8 +580,10 @@ inline flatbuffers::Offset CreateCache( flatbuffers::Offset>> programs = 0, flatbuffers::Offset>> tunings = 0, flatbuffers::Offset>> tuned = 0, - flatbuffers::Offset>> gemm = 0) { + flatbuffers::Offset>> gemm = 0, + flatbuffers::Offset>> preParam = 0) { CacheBuilder builder_(_fbb); + builder_.add_preParam(preParam); builder_.add_gemm(gemm); builder_.add_tuned(tuned); builder_.add_tunings(tunings); @@ -671,6 +753,35 @@ inline flatbuffers::Offset CreateGemmInfo(flatbuffers::FlatBufferBuild _paramInfo); } +inline PreParamInfoT *PreParamInfo::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new PreParamInfoT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void PreParamInfo::UnPackTo(PreParamInfoT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = preParamName(); if (_e) _o->preParamName = _e->str(); }; + { auto _e = preParamData(); _o->preParamData = _e; }; +} + +inline flatbuffers::Offset PreParamInfo::Pack(flatbuffers::FlatBufferBuilder &_fbb, const PreParamInfoT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreatePreParamInfo(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreatePreParamInfo(flatbuffers::FlatBufferBuilder &_fbb, const PreParamInfoT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const PreParamInfoT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _preParamName = _o->preParamName.empty() ? 0 : _fbb.CreateString(_o->preParamName); + auto _preParamData = _o->preParamData; + return CLCache::CreatePreParamInfo( + _fbb, + _preParamName, + _preParamData); +} + inline CacheT *Cache::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = new CacheT(); UnPackTo(_o, _resolver); @@ -684,6 +795,7 @@ inline void Cache::UnPackTo(CacheT *_o, const flatbuffers::resolver_function_t * { auto _e = tunings(); if (_e) { _o->tunings.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->tunings[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; { auto _e = tuned(); if (_e) { _o->tuned.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->tuned[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; { auto _e = gemm(); if (_e) { _o->gemm.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->gemm[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; + { auto _e = preParam(); if (_e) { _o->preParam.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->preParam[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; } inline flatbuffers::Offset Cache::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CacheT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -698,12 +810,14 @@ inline flatbuffers::Offset CreateCache(flatbuffers::FlatBufferBuilder &_f auto _tunings = _o->tunings.size() ? _fbb.CreateVector> (_o->tunings.size(), [](size_t i, _VectorArgs *__va) { return CreateAutotuning(*__va->__fbb, __va->__o->tunings[i].get(), __va->__rehasher); }, &_va ) : 0; auto _tuned = _o->tuned.size() ? _fbb.CreateVector> (_o->tuned.size(), [](size_t i, _VectorArgs *__va) { return CreateOpInfo(*__va->__fbb, __va->__o->tuned[i].get(), __va->__rehasher); }, &_va ) : 0; auto _gemm = _o->gemm.size() ? _fbb.CreateVector> (_o->gemm.size(), [](size_t i, _VectorArgs *__va) { return CreateGemmInfo(*__va->__fbb, __va->__o->gemm[i].get(), __va->__rehasher); }, &_va ) : 0; + auto _preParam = _o->preParam.size() ? _fbb.CreateVector> (_o->preParam.size(), [](size_t i, _VectorArgs *__va) { return CreatePreParamInfo(*__va->__fbb, __va->__o->preParam[i].get(), __va->__rehasher); }, &_va ) : 0; return CLCache::CreateCache( _fbb, _programs, _tunings, _tuned, - _gemm); + _gemm, + _preParam); } inline const flatbuffers::TypeTable *TensorInfoTypeTable() { @@ -794,27 +908,45 @@ inline const flatbuffers::TypeTable *GemmInfoTypeTable() { return &tt; } +inline const flatbuffers::TypeTable *PreParamInfoTypeTable() { + static const flatbuffers::TypeCode type_codes[] = { + { flatbuffers::ET_STRING, 0, -1 }, + { flatbuffers::ET_UINT, 0, -1 } + }; + static const char * const names[] = { + "preParamName", + "preParamData" + }; + static const flatbuffers::TypeTable tt = { + flatbuffers::ST_TABLE, 2, type_codes, nullptr, nullptr, names + }; + return &tt; +} + inline const flatbuffers::TypeTable *CacheTypeTable() { static const flatbuffers::TypeCode type_codes[] = { { flatbuffers::ET_SEQUENCE, 1, 0 }, { flatbuffers::ET_SEQUENCE, 1, 1 }, { flatbuffers::ET_SEQUENCE, 1, 2 }, - { flatbuffers::ET_SEQUENCE, 1, 3 } + { flatbuffers::ET_SEQUENCE, 1, 3 }, + { flatbuffers::ET_SEQUENCE, 1, 4 } }; static const flatbuffers::TypeFunction type_refs[] = { ShaderTypeTable, AutotuningTypeTable, OpInfoTypeTable, - GemmInfoTypeTable + GemmInfoTypeTable, + PreParamInfoTypeTable }; static const char * const names[] = { "programs", "tunings", "tuned", - "gemm" + "gemm", + "preParam" }; static const flatbuffers::TypeTable tt = { - flatbuffers::ST_TABLE, 4, type_codes, type_refs, nullptr, names + flatbuffers::ST_TABLE, 5, type_codes, type_refs, nullptr, names }; return &tt; } diff --git a/source/backend/tensorrt/execution/TRTConvolution.cpp b/source/backend/tensorrt/execution/TRTConvolution.cpp index 109b24cb1..831dae85a 100644 --- a/source/backend/tensorrt/execution/TRTConvolution.cpp +++ b/source/backend/tensorrt/execution/TRTConvolution.cpp @@ -34,7 +34,7 @@ std::vector TRTConvolution::onEncode(const std::vector &xO int weightSize = 0; std::shared_ptr quanWeight; if (nullptr != mOp->main_as_Convolution2D()->quanParameter()) { - quanWeight = ConvolutionCommon::load(mOp->main_as_Convolution2D(), backend(), true); + quanWeight = ConvolutionCommon::load(mOp, backend(), true); srcCount = quanWeight->weightFloat.size() / (outputCount * kernelX * kernelY); source = quanWeight->weightFloat.get(); weightSize = quanWeight->weightFloat.size(); diff --git a/source/backend/tensorrt/execution/TRTDeconvolution.cpp b/source/backend/tensorrt/execution/TRTDeconvolution.cpp index 835a17b86..45b7d3a4c 100755 --- a/source/backend/tensorrt/execution/TRTDeconvolution.cpp +++ b/source/backend/tensorrt/execution/TRTDeconvolution.cpp @@ -35,7 +35,7 @@ std::vector TRTDeconvolution::onEncode(const std::vector & int weightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend(), conv2D, &source, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend(), mOp, &source, &weightSize); nvinfer1::DimsHW NVKSize(kernelY, kernelX); nvinfer1::DimsHW NVKSSize(conv2DCommon->strideY(), conv2DCommon->strideX()); @@ -56,7 +56,7 @@ std::vector TRTDeconvolution::onEncode(const std::vector & if (conv2DCommon->padMode() == PadMode_SAME) { conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); - } + } conv_layer->setName(mOp->name()->str().c_str()); auto relu = conv2DCommon->relu(); auto relu6 = conv2DCommon->relu6(); diff --git a/source/backend/tensorrt/execution/TRTDepthwiseConvolution.cpp b/source/backend/tensorrt/execution/TRTDepthwiseConvolution.cpp index e2beeb066..194bc0cad 100644 --- a/source/backend/tensorrt/execution/TRTDepthwiseConvolution.cpp +++ b/source/backend/tensorrt/execution/TRTDepthwiseConvolution.cpp @@ -36,7 +36,7 @@ std::vector TRTDepthwiseConvolution::onEncode(const std::vector quanWeight; if (nullptr != mOp->main_as_Convolution2D()->quanParameter()) { - quanWeight = ConvolutionCommon::load(mOp->main_as_Convolution2D(), backend(), true); + quanWeight = ConvolutionCommon::load(mOp, backend(), true); source = quanWeight->weightFloat.get(); weightSize = quanWeight->weightFloat.size(); } else { @@ -61,7 +61,7 @@ std::vector TRTDepthwiseConvolution::onEncode(const std::vectorsetPadding(nvinfer1::DimsHW{pads.second, pads.first}); if (conv2DCommon->padMode() == PadMode_SAME) { conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); - } + } conv_layer->setName(mOp->name()->str().c_str()); auto relu = conv2DCommon->relu(); auto relu6 = conv2DCommon->relu6(); diff --git a/source/backend/tensorrt/execution/TRTDepthwiseDeconvolution.cpp b/source/backend/tensorrt/execution/TRTDepthwiseDeconvolution.cpp index fe620a824..4e0fae803 100755 --- a/source/backend/tensorrt/execution/TRTDepthwiseDeconvolution.cpp +++ b/source/backend/tensorrt/execution/TRTDepthwiseDeconvolution.cpp @@ -33,9 +33,9 @@ std::vector TRTDepthwiseDeconvolution::onEncode(const std::vectoroutputCount(); const float *source = nullptr; int weightSize = 0; - + std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, backend(), conv2D, &source, &weightSize); + ConvolutionCommon::getConvParameters(&quanCommon, backend(), mOp, &source, &weightSize); nvinfer1::DimsHW NVKSize(kernelY, kernelX); nvinfer1::DimsHW NVKSSize(conv2DCommon->strideY(), conv2DCommon->strideX()); @@ -56,7 +56,7 @@ std::vector TRTDepthwiseDeconvolution::onEncode(const std::vectorpadMode() == PadMode_SAME) { conv_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); - } + } conv_layer->setName(mOp->name()->str().c_str()); auto relu = conv2DCommon->relu(); auto relu6 = conv2DCommon->relu6(); diff --git a/source/backend/vulkan/CMakeLists.txt b/source/backend/vulkan/CMakeLists.txt index 9504a1ec2..555738aa7 100644 --- a/source/backend/vulkan/CMakeLists.txt +++ b/source/backend/vulkan/CMakeLists.txt @@ -43,5 +43,5 @@ else() endif() if (CMAKE_SYSTEM_NAME MATCHES "^Android") - add_definitions(-DVK_USE_PLATFORM_ANDROID_KHR) + add_definitions(-DVK_USE_PLATFORM_ANDROID_KHR) endif() diff --git a/source/backend/vulkan/buffer/backend/VulkanBackend.cpp b/source/backend/vulkan/buffer/backend/VulkanBackend.cpp index 7ddb8c713..34f992374 100644 --- a/source/backend/vulkan/buffer/backend/VulkanBackend.cpp +++ b/source/backend/vulkan/buffer/backend/VulkanBackend.cpp @@ -17,9 +17,6 @@ #include "execution/VulkanBasicExecution.hpp" //#define MNN_OPEN_TIME_TRACE #include -#ifdef MNN_USE_NEON -#include -#endif #define MNN_OP_SUPPORT_LOG //#define MNN_VULKAN_DUMP_MEMORY_USAGE @@ -417,7 +414,7 @@ std::vector VulkanBackend::autoTunePipeline(const VulkanPipeline* pipe mRuntime->mDevice->getMaxComputeWorkGroupSize(maxGroups); std::vector lws_prefer(3, 1); - uint32_t min_cost = UINT_MAX; + float min_cost = -1.0f; while(lws[2] <= gws[2] && lws[2] <= maxGroups[2]) { lws[1] = 1; @@ -430,8 +427,8 @@ std::vector VulkanBackend::autoTunePipeline(const VulkanPipeline* pipe groupSize[2] = UP_DIV(gws[2], lws[2]); pipeline->changePipeline(lws); - int cost_time = (int)getPipelineTime(pipeline, des, groupSize); - if(cost_time < min_cost) { + auto cost_time = getPipelineTime(pipeline, des, groupSize); + if(cost_time < min_cost || min_cost < 0.0f) { min_cost = cost_time; lws_prefer[0] = lws[0]; lws_prefer[1] = lws[1]; diff --git a/source/backend/vulkan/buffer/execution/VulkanConvolution.cpp b/source/backend/vulkan/buffer/execution/VulkanConvolution.cpp index 87b1c85bf..ee8985122 100644 --- a/source/backend/vulkan/buffer/execution/VulkanConvolution.cpp +++ b/source/backend/vulkan/buffer/execution/VulkanConvolution.cpp @@ -341,7 +341,7 @@ class VulkanConvolutionSlideWindowsInt8 : public VulkanConvolutionCommon { } } vkBn->copyToGPUBuffer(wscaleData.data(), res.mWeightScale->buffer(), ocC4 * 4 * 2 * sizeof(float), 0); - + // Build Pipeline // Create Pipeline std::vector convTypes{ @@ -428,9 +428,9 @@ class VulkanConvolutionCreator : public VulkanBackend::Creator { } } if (quan->buffer() && OpType_Convolution == op->type()) { - quanWeight = ConvolutionCommon::load(op->main_as_Convolution2D(), backend, false, true); + quanWeight = ConvolutionCommon::load(op, backend, false, true); } else { - quanWeight = ConvolutionCommon::load(op->main_as_Convolution2D(), backend, true); + quanWeight = ConvolutionCommon::load(op, backend, true); } if (quanWeight->weight.get() != nullptr) { useInt8Conv = true; diff --git a/source/backend/vulkan/buffer/execution/VulkanDeconvolution.cpp b/source/backend/vulkan/buffer/execution/VulkanDeconvolution.cpp index a79ea9d7c..59372ff91 100644 --- a/source/backend/vulkan/buffer/execution/VulkanDeconvolution.cpp +++ b/source/backend/vulkan/buffer/execution/VulkanDeconvolution.cpp @@ -11,13 +11,14 @@ #include "core/TensorUtils.hpp" namespace MNN { static void _initKernelRegion() { - + } VulkanDeconvolution::VulkanDeconvolution(Backend* bn) : VulkanBasicExecution(bn) { // Donthing } -VulkanDeconvolution* VulkanDeconvolution::create(Backend* bn, const Convolution2D* conv, OpType type, bool multiInputs) { +VulkanDeconvolution* VulkanDeconvolution::create(Backend* bn, const Op* op, OpType type, bool multiInputs) { + auto conv = op->main_as_Convolution2D(); auto exeRes = new VulkanDeconvolution(bn); exeRes->mConvCommonOption = conv->common(); auto vkBn = (VulkanBackend*)bn; @@ -45,7 +46,7 @@ VulkanDeconvolution* VulkanDeconvolution::create(Backend* bn, const Convolution2 int tempWeightSize = 0; std::shared_ptr quanCommon; if (!multiInputs) { - ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &tempWeight, &tempWeightSize); + ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &tempWeight, &tempWeightSize); MNN_ASSERT(nullptr != tempWeight); if (0 >= ci) { ci = tempWeightSize / co / kw / kh; @@ -212,7 +213,7 @@ class VulkanDeconvolutionCreator : public VulkanBackend::Creator { public: virtual VulkanBasicExecution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { - return VulkanDeconvolution::create(backend, op->main_as_Convolution2D(), op->type(), inputs.size() > 1); + return VulkanDeconvolution::create(backend, op, op->type(), inputs.size() > 1); } }; diff --git a/source/backend/vulkan/buffer/execution/VulkanDeconvolution.hpp b/source/backend/vulkan/buffer/execution/VulkanDeconvolution.hpp index 97484a35f..97bcbe88f 100644 --- a/source/backend/vulkan/buffer/execution/VulkanDeconvolution.hpp +++ b/source/backend/vulkan/buffer/execution/VulkanDeconvolution.hpp @@ -17,7 +17,7 @@ class VulkanDeconvolution : public VulkanBasicExecution { virtual ~VulkanDeconvolution() { } - static VulkanDeconvolution* create(Backend* bn, const Convolution2D* conv, OpType type, bool multiInputs); + static VulkanDeconvolution* create(Backend* bn, const Op* op, OpType type, bool multiInputs); virtual ErrorCode onEncode(const std::vector& inputs, const std::vector& outputs, const VulkanCommandPool::Buffer* cmdBuffer) override; diff --git a/source/backend/vulkan/buffer/execution/VulkanUnary.cpp b/source/backend/vulkan/buffer/execution/VulkanUnary.cpp index de98e09ed..81d53c465 100644 --- a/source/backend/vulkan/buffer/execution/VulkanUnary.cpp +++ b/source/backend/vulkan/buffer/execution/VulkanUnary.cpp @@ -71,6 +71,8 @@ static std::string _getMidType(const Op* op) { SETTYPE(UnaryOpOperation_SQUARE, "SQUARE"); SETTYPE(UnaryOpOperation_LOG, "LOG"); SETTYPE(UnaryOpOperation_GELU, "GELU"); + // Since SPIR-V lacks a built-in erf (gauss error function) instruction and the existing shader implementation of GELU is essentially an approximation of erf, there is no need to add a new implementation of GELU_STANDARD. + SETTYPE(UnaryOpOperation_GELU_STANDARD, "GELU"); SETTYPE(UnaryOpOperation_TAN, "TAN"); SETTYPE(UnaryOpOperation_COS, "COS"); diff --git a/source/backend/vulkan/component/VulkanDevice.cpp b/source/backend/vulkan/component/VulkanDevice.cpp index 79175277a..06f1eb5f2 100644 --- a/source/backend/vulkan/component/VulkanDevice.cpp +++ b/source/backend/vulkan/component/VulkanDevice.cpp @@ -10,14 +10,13 @@ #include //#define MNN_VULKAN_PRINT_EXT namespace MNN { -VulkanDevice::VulkanDevice(std::shared_ptr instance, const std::vector& device_extensions) +VulkanDevice::VulkanDevice(std::shared_ptr instance) : mOwner(true), mInstance(instance), mQueueFamilyIndex(0), mPhysicalDevice(VK_NULL_HANDLE), mDevice(VK_NULL_HANDLE), mQueue(VK_NULL_HANDLE) { - MNN_ASSERT(mInstance->success()); // Find one GPU to use: // On Android, every GPU device is equal -- supporting // graphics/compute/present @@ -68,6 +67,23 @@ VulkanDevice::VulkanDevice(std::shared_ptr instance, const std:: mDeviceFeature.shaderStorageImageWriteWithoutFormat = VK_TRUE; //vkGetPhysicalDeviceFeatures(mPhysicalDevice, &mDeviceFeature); + // Set device extensions. + std::vector deviceExtensions; + std::vector deviceExtensionsToCheck = { + "VK_KHR_portability_subset" + }; + uint32_t availableDeviceExtensionCount = 0; + CALL_VK(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr, &availableDeviceExtensionCount, nullptr)); + std::vector availableDeviceExtensions(availableDeviceExtensionCount); + CALL_VK(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr, &availableDeviceExtensionCount, availableDeviceExtensions.data())); + for (uint32_t i = 0; i < availableDeviceExtensionCount; i++) { + for (uint32_t j = 0; j < deviceExtensionsToCheck.size(); j++) { + if (strcmp(availableDeviceExtensions[i].extensionName, deviceExtensionsToCheck[j]) == 0) { + deviceExtensions.push_back(deviceExtensionsToCheck[j]); + } + } + } + VkDeviceCreateInfo deviceCreateInfo{ /* .sType = */ VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, /* .pNext = */ nullptr, @@ -76,11 +92,16 @@ VulkanDevice::VulkanDevice(std::shared_ptr instance, const std:: /* .pQueueCreateInfos = */ &queueCreateInfo, /* .enabledLayerCount = */ 0, /* .ppEnabledLayerNames = */ nullptr, - /* .enabledExtensionCount = */ static_cast(device_extensions.size()), - /* .ppEnabledExtensionNames = */ device_extensions.data(), + /* .enabledExtensionCount = */ static_cast(deviceExtensions.size()), + /* .ppEnabledExtensionNames = */ deviceExtensions.data(), /* .pEnabledFeatures = */ &mDeviceFeature, }; + mDevice = VK_NULL_HANDLE; CALL_VK(vkCreateDevice(mPhysicalDevice, &deviceCreateInfo, nullptr, &mDevice)); + if (VK_NULL_HANDLE == mDevice) { + MNN_ERROR("Can't create vk device\n"); + return; + } vkGetPhysicalDeviceProperties(mPhysicalDevice, &mDeviceProty); vkGetPhysicalDeviceMemoryProperties(mPhysicalDevice, &mMemoryProty); getDeviceQueue(mQueueFamilyIndex, 0, mQueue); diff --git a/source/backend/vulkan/component/VulkanDevice.hpp b/source/backend/vulkan/component/VulkanDevice.hpp index 32a9d42f8..7eae18dca 100644 --- a/source/backend/vulkan/component/VulkanDevice.hpp +++ b/source/backend/vulkan/component/VulkanDevice.hpp @@ -18,8 +18,7 @@ namespace MNN { class VulkanDevice : public NonCopyable { public: - explicit VulkanDevice(std::shared_ptr instance, - const std::vector& device_extensions = {}); + explicit VulkanDevice(std::shared_ptr instance); explicit VulkanDevice(std::shared_ptr instance, VkPhysicalDevice physicalDevice, VkDevice device, uint32_t queueFamilyIndex, VkQueue queue); virtual ~VulkanDevice(); diff --git a/source/backend/vulkan/component/VulkanInstance.cpp b/source/backend/vulkan/component/VulkanInstance.cpp index db2e5b2c7..4cee805fe 100644 --- a/source/backend/vulkan/component/VulkanInstance.cpp +++ b/source/backend/vulkan/component/VulkanInstance.cpp @@ -8,6 +8,7 @@ #include "backend/vulkan/component/VulkanInstance.hpp" #include +#include namespace MNN { VulkanInstance::VulkanInstance() : mOwner(true), mInstance(VK_NULL_HANDLE) { @@ -20,17 +21,42 @@ VulkanInstance::VulkanInstance() : mOwner(true), mInstance(VK_NULL_HANDLE) { /* .engineVersion = */ VK_MAKE_VERSION(1, 0, 0), /* .apiVersion = */ VK_MAKE_VERSION(1, 0, 0), }; - std::vector instance_extensions; + + // Set instance extensions. + std::vector instanceExtensions; + std::vector instanceExtensionsToCheck = { + VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME, + VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME + }; + uint32_t availableInstanceExtensionCount = 0; + CALL_VK(vkEnumerateInstanceExtensionProperties(nullptr, &availableInstanceExtensionCount, nullptr)); + std::vector availableInstanceExtensions(availableInstanceExtensionCount); + CALL_VK(vkEnumerateInstanceExtensionProperties(nullptr, &availableInstanceExtensionCount, availableInstanceExtensions.data())); + for (uint32_t i = 0; i < availableInstanceExtensionCount; i++) { + for (uint32_t j = 0; j < instanceExtensionsToCheck.size(); j++) { + if (strcmp(availableInstanceExtensions[i].extensionName, instanceExtensionsToCheck[j]) == 0) { + instanceExtensions.push_back(instanceExtensionsToCheck[j]); + } + } + } + + // Set instanceCreateFlag. + auto it = std::find_if(instanceExtensions.begin(), instanceExtensions.end(), + [](const char* str) { return strcmp(str, VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME) == 0; }); + VkInstanceCreateFlags instanceCreateFlag = (it != instanceExtensions.end()) ? VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR : 0; + #ifdef MNN_VULKAN_DEBUG + MNN_PRINT("MNN_VULKAN_DEBUG is on.\n"); const std::vector validationLayers = { "VK_LAYER_KHRONOS_validation" }; #endif + // Create the Vulkan instance VkInstanceCreateInfo instanceCreateInfo{ /* .sType = */ VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, /* .pNext = */ nullptr, - /* .flags = */ 0, + /* .flags = */ instanceCreateFlag, /* .pApplicationInfo = */ &appInfo, #ifdef MNN_VULKAN_DEBUG /* .enabledLayerCount = */ 1, @@ -39,8 +65,8 @@ VulkanInstance::VulkanInstance() : mOwner(true), mInstance(VK_NULL_HANDLE) { /* .enabledLayerCount = */ 0, /* .ppEnabledLayerNames = */ nullptr, #endif - /* .enabledExtensionCount = */ static_cast(instance_extensions.size()), - /* .ppEnabledExtensionNames = */ instance_extensions.data(), + /* .enabledExtensionCount = */ static_cast(instanceExtensions.size()), + /* .ppEnabledExtensionNames = */ instanceExtensions.data(), }; CALL_VK(vkCreateInstance(&instanceCreateInfo, nullptr, &mInstance)); } @@ -65,6 +91,9 @@ void VulkanInstance::getPhysicalDeviceQueueFamilyProperties(const VkPhysicalDevi } const bool VulkanInstance::supportVulkan() const { + if (VK_NULL_HANDLE == mInstance) { + return false; + } uint32_t gpuCount = 0; auto res = enumeratePhysicalDevices(gpuCount, nullptr); if ((0 == gpuCount) || (VK_SUCCESS != res)) { diff --git a/source/backend/vulkan/component/VulkanInstance.hpp b/source/backend/vulkan/component/VulkanInstance.hpp index 4a6969896..cc57d0c88 100644 --- a/source/backend/vulkan/component/VulkanInstance.hpp +++ b/source/backend/vulkan/component/VulkanInstance.hpp @@ -28,11 +28,6 @@ class VulkanInstance : public NonCopyable { VkInstance get() const { return mInstance; } - - bool success() const { - return (VK_NULL_HANDLE != mInstance); - } - private: bool mOwner; VkInstance mInstance; diff --git a/source/backend/vulkan/component/VulkanPipeline.cpp b/source/backend/vulkan/component/VulkanPipeline.cpp index c73a6b8bf..e0da6bcdd 100644 --- a/source/backend/vulkan/component/VulkanPipeline.cpp +++ b/source/backend/vulkan/component/VulkanPipeline.cpp @@ -55,8 +55,9 @@ VulkanPipeline* VulkanPipelineFactory::createComputePipeline(const uint8_t* data VkPipeline pipeline; /*for localSize_x_id = 0,localSize_y_id = 1,localSize_z_id = 2*/ std::vector specializationMapEntry; /*localSize data description*/ - std::shared_ptr specializationInfo = std::make_shared(); + std::shared_ptr specializationInfo; if (localSize.size() > 0) { + specializationInfo = std::make_shared(); // FUNC_PRINT(localSize.size()); for (int i = 0; i < localSize.size(); i++) { VkSpecializationMapEntry entry = {(uint32_t)(i), (uint32_t)(sizeof(uint32_t) * i), diff --git a/source/backend/vulkan/component/VulkanQueryPool.cpp b/source/backend/vulkan/component/VulkanQueryPool.cpp index 82ef40a41..7cbe6a928 100644 --- a/source/backend/vulkan/component/VulkanQueryPool.cpp +++ b/source/backend/vulkan/component/VulkanQueryPool.cpp @@ -35,7 +35,7 @@ float VulkanQueryPool::VulkanGetQueryPoolResults(){ vkGetQueryPoolResults(mDevice.get(), queryPool, 0, 2, sizeof(uint64_t) * 2, timestamps, sizeof(uint64_t), VK_QUERY_RESULT_WAIT_BIT); float timestampPeriod = mDevice.getTimestampPeriod(); - float executionTime = (timestamps[1] - timestamps[0]) * timestampPeriod * 1e-3f; // 微妙 + float executionTime = (timestamps[1] - timestamps[0]) * timestampPeriod * 1e-3f; // us return executionTime; } } // namespace MNN diff --git a/source/backend/vulkan/image/backend/VulkanBackend.cpp b/source/backend/vulkan/image/backend/VulkanBackend.cpp index 0892a53b0..0663ceba6 100644 --- a/source/backend/vulkan/image/backend/VulkanBackend.cpp +++ b/source/backend/vulkan/image/backend/VulkanBackend.cpp @@ -22,7 +22,7 @@ #ifdef MNN_USE_NEON #include #endif -//#define MNN_OP_SUPPORT_LOG +#define MNN_OP_SUPPORT_LOG //#define MNN_VULKAN_DUMP_MEMORY_USAGE #define MNN_VULKAN_MAX_CACHE_CONVSIZE 50 namespace MNN { @@ -89,9 +89,6 @@ const VulkanPipeline* VulkanBackend::getPipeline(const std::string& key, const s } bool VulkanBackend::_supportImageSize(const Tensor* MTensor) { - if (MTensor->getType().code != halide_type_float) { - return false; - } auto format = TensorUtils::getDescribe(MTensor)->dimensionFormat; if (format != MNN_DATA_FORMAT_NC4HW4) { return true; diff --git a/source/backend/vulkan/image/compiler/AllShader.cpp b/source/backend/vulkan/image/compiler/AllShader.cpp index 559166efc..e9b7860f1 100644 --- a/source/backend/vulkan/image/compiler/AllShader.cpp +++ b/source/backend/vulkan/image/compiler/AllShader.cpp @@ -229,18 +229,13 @@ const unsigned char glsl_deconvCol2Im_comp[] = { 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x93, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x93, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x93, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x94, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9a, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -376,7 +371,7 @@ const unsigned char glsl_deconvCol2Im_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x5b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x5b, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_deconvCol2Im_comp_len = 2352; +unsigned int glsl_deconvCol2Im_comp_len = 2292; const unsigned char glsl_convolutionDepthwiseMali_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -431,31 +426,19 @@ const unsigned char glsl_convolutionDepthwiseMali_comp[] = { 0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xc7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc7, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc7, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc7, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc7, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc8, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -703,7 +686,7 @@ const unsigned char glsl_convolutionDepthwiseMali_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_convolutionDepthwiseMali_comp_len = 3868; +unsigned int glsl_convolutionDepthwiseMali_comp_len = 3724; const unsigned char glsl_convolutionDepthwiseMali_RELU_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -758,31 +741,19 @@ const unsigned char glsl_convolutionDepthwiseMali_RELU_comp[] = { 0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xcb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xcb, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xcb, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcb, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcb, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcc, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xde, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xde, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -1036,7 +1007,7 @@ const unsigned char glsl_convolutionDepthwiseMali_RELU_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_convolutionDepthwiseMali_RELU_comp_len = 3940; +unsigned int glsl_convolutionDepthwiseMali_RELU_comp_len = 3796; const unsigned char glsl_convolutionDepthwiseMali_RELU6_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -1091,31 +1062,19 @@ const unsigned char glsl_convolutionDepthwiseMali_RELU6_comp[] = { 0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xcd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xcd, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xcd, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcd, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcd, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xce, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe0, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe0, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -1373,7 +1332,7 @@ const unsigned char glsl_convolutionDepthwiseMali_RELU6_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_convolutionDepthwiseMali_RELU6_comp_len = 3988; +unsigned int glsl_convolutionDepthwiseMali_RELU6_comp_len = 3844; const unsigned char glsl_relu_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -1408,19 +1367,13 @@ const unsigned char glsl_relu_comp[] = { 0x47, 0x00, 0x03, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x3a, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x50, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x50, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x50, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x50, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x50, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x59, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -1518,7 +1471,7 @@ const unsigned char glsl_relu_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_relu_comp_len = 1692; +unsigned int glsl_relu_comp_len = 1620; const unsigned char glsl_unaryImage_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -1556,7 +1509,6 @@ const unsigned char glsl_unaryImage_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x3f, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -1634,7 +1586,7 @@ const unsigned char glsl_unaryImage_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_comp_len = 1344; +unsigned int glsl_unaryImage_comp_len = 1332; const unsigned char glsl_unaryImage_SIGMOID_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -1672,7 +1624,6 @@ const unsigned char glsl_unaryImage_SIGMOID_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x47, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x47, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -1761,7 +1712,7 @@ const unsigned char glsl_unaryImage_SIGMOID_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_SIGMOID_comp_len = 1468; +unsigned int glsl_unaryImage_SIGMOID_comp_len = 1456; const unsigned char glsl_unaryImage_TANH_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -1799,7 +1750,6 @@ const unsigned char glsl_unaryImage_TANH_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -1879,7 +1829,7 @@ const unsigned char glsl_unaryImage_TANH_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_TANH_comp_len = 1368; +unsigned int glsl_unaryImage_TANH_comp_len = 1356; const unsigned char glsl_unaryImage_ABS_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -1917,7 +1867,6 @@ const unsigned char glsl_unaryImage_ABS_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -1997,7 +1946,7 @@ const unsigned char glsl_unaryImage_ABS_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_ABS_comp_len = 1368; +unsigned int glsl_unaryImage_ABS_comp_len = 1356; const unsigned char glsl_unaryImage_SQRT_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -2035,7 +1984,6 @@ const unsigned char glsl_unaryImage_SQRT_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -2115,7 +2063,7 @@ const unsigned char glsl_unaryImage_SQRT_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_SQRT_comp_len = 1368; +unsigned int glsl_unaryImage_SQRT_comp_len = 1356; const unsigned char glsl_unaryImage_RSQRT_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -2153,7 +2101,6 @@ const unsigned char glsl_unaryImage_RSQRT_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -2233,7 +2180,7 @@ const unsigned char glsl_unaryImage_RSQRT_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_RSQRT_comp_len = 1368; +unsigned int glsl_unaryImage_RSQRT_comp_len = 1356; const unsigned char glsl_unaryImage_NEG_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -2271,7 +2218,6 @@ const unsigned char glsl_unaryImage_NEG_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -2351,7 +2297,7 @@ const unsigned char glsl_unaryImage_NEG_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_NEG_comp_len = 1360; +unsigned int glsl_unaryImage_NEG_comp_len = 1348; const unsigned char glsl_unaryImage_SQUARE_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -2389,7 +2335,6 @@ const unsigned char glsl_unaryImage_SQUARE_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x42, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x42, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x42, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x42, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -2469,7 +2414,7 @@ const unsigned char glsl_unaryImage_SQUARE_comp[] = { 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_SQUARE_comp_len = 1364; +unsigned int glsl_unaryImage_SQUARE_comp_len = 1352; const unsigned char glsl_unaryImage_EXP_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -2507,7 +2452,6 @@ const unsigned char glsl_unaryImage_EXP_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -2587,7 +2531,7 @@ const unsigned char glsl_unaryImage_EXP_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_EXP_comp_len = 1368; +unsigned int glsl_unaryImage_EXP_comp_len = 1356; const unsigned char glsl_unaryImage_SIGN_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -2625,7 +2569,6 @@ const unsigned char glsl_unaryImage_SIGN_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -2705,7 +2648,7 @@ const unsigned char glsl_unaryImage_SIGN_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_SIGN_comp_len = 1368; +unsigned int glsl_unaryImage_SIGN_comp_len = 1356; const unsigned char glsl_unaryImage_LOG_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -2743,7 +2686,6 @@ const unsigned char glsl_unaryImage_LOG_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x44, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x44, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x49, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -2829,7 +2771,7 @@ const unsigned char glsl_unaryImage_LOG_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_LOG_comp_len = 1440; +unsigned int glsl_unaryImage_LOG_comp_len = 1428; const unsigned char glsl_unaryImage_TAN_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -2867,7 +2809,6 @@ const unsigned char glsl_unaryImage_TAN_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -2947,7 +2888,7 @@ const unsigned char glsl_unaryImage_TAN_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_TAN_comp_len = 1368; +unsigned int glsl_unaryImage_TAN_comp_len = 1356; const unsigned char glsl_unaryImage_COS_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -2985,7 +2926,6 @@ const unsigned char glsl_unaryImage_COS_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -3065,7 +3005,7 @@ const unsigned char glsl_unaryImage_COS_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_COS_comp_len = 1368; +unsigned int glsl_unaryImage_COS_comp_len = 1356; const unsigned char glsl_unaryImage_SIN_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -3103,7 +3043,6 @@ const unsigned char glsl_unaryImage_SIN_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -3183,7 +3122,7 @@ const unsigned char glsl_unaryImage_SIN_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_SIN_comp_len = 1368; +unsigned int glsl_unaryImage_SIN_comp_len = 1356; const unsigned char glsl_unaryImage_CEIL_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -3221,7 +3160,6 @@ const unsigned char glsl_unaryImage_CEIL_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -3301,7 +3239,7 @@ const unsigned char glsl_unaryImage_CEIL_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_CEIL_comp_len = 1368; +unsigned int glsl_unaryImage_CEIL_comp_len = 1356; const unsigned char glsl_unaryImage_FLOOR_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -3339,7 +3277,6 @@ const unsigned char glsl_unaryImage_FLOOR_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -3419,7 +3356,7 @@ const unsigned char glsl_unaryImage_FLOOR_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_FLOOR_comp_len = 1368; +unsigned int glsl_unaryImage_FLOOR_comp_len = 1356; const unsigned char glsl_unaryImage_EXPM1_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -3457,7 +3394,6 @@ const unsigned char glsl_unaryImage_EXPM1_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x44, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x44, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x49, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -3543,7 +3479,7 @@ const unsigned char glsl_unaryImage_EXPM1_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_EXPM1_comp_len = 1432; +unsigned int glsl_unaryImage_EXPM1_comp_len = 1420; const unsigned char glsl_unaryImage_RECIPROCAL_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -3581,7 +3517,6 @@ const unsigned char glsl_unaryImage_RECIPROCAL_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x43, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x43, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x43, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x43, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -3665,7 +3600,7 @@ const unsigned char glsl_unaryImage_RECIPROCAL_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_RECIPROCAL_comp_len = 1408; +unsigned int glsl_unaryImage_RECIPROCAL_comp_len = 1396; const unsigned char glsl_unaryImage_SINH_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -3703,7 +3638,6 @@ const unsigned char glsl_unaryImage_SINH_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -3783,7 +3717,7 @@ const unsigned char glsl_unaryImage_SINH_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_SINH_comp_len = 1368; +unsigned int glsl_unaryImage_SINH_comp_len = 1356; const unsigned char glsl_unaryImage_ASINH_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -3821,7 +3755,6 @@ const unsigned char glsl_unaryImage_ASINH_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -3901,7 +3834,7 @@ const unsigned char glsl_unaryImage_ASINH_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_ASINH_comp_len = 1368; +unsigned int glsl_unaryImage_ASINH_comp_len = 1356; const unsigned char glsl_unaryImage_ASIN_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -3939,7 +3872,6 @@ const unsigned char glsl_unaryImage_ASIN_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -4019,7 +3951,7 @@ const unsigned char glsl_unaryImage_ASIN_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_ASIN_comp_len = 1368; +unsigned int glsl_unaryImage_ASIN_comp_len = 1356; const unsigned char glsl_unaryImage_COSH_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -4057,7 +3989,6 @@ const unsigned char glsl_unaryImage_COSH_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -4137,7 +4068,7 @@ const unsigned char glsl_unaryImage_COSH_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_COSH_comp_len = 1368; +unsigned int glsl_unaryImage_COSH_comp_len = 1356; const unsigned char glsl_unaryImage_ACOS_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -4175,7 +4106,6 @@ const unsigned char glsl_unaryImage_ACOS_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -4255,7 +4185,7 @@ const unsigned char glsl_unaryImage_ACOS_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_ACOS_comp_len = 1368; +unsigned int glsl_unaryImage_ACOS_comp_len = 1356; const unsigned char glsl_unaryImage_ACOSH_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -4293,7 +4223,6 @@ const unsigned char glsl_unaryImage_ACOSH_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -4373,7 +4302,7 @@ const unsigned char glsl_unaryImage_ACOSH_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_ACOSH_comp_len = 1368; +unsigned int glsl_unaryImage_ACOSH_comp_len = 1356; const unsigned char glsl_unaryImage_ATAN_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -4411,7 +4340,6 @@ const unsigned char glsl_unaryImage_ATAN_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -4491,7 +4419,7 @@ const unsigned char glsl_unaryImage_ATAN_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_ATAN_comp_len = 1368; +unsigned int glsl_unaryImage_ATAN_comp_len = 1356; const unsigned char glsl_unaryImage_ATANH_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -4529,7 +4457,6 @@ const unsigned char glsl_unaryImage_ATANH_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -4609,7 +4536,7 @@ const unsigned char glsl_unaryImage_ATANH_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_ATANH_comp_len = 1368; +unsigned int glsl_unaryImage_ATANH_comp_len = 1356; const unsigned char glsl_unaryImage_LOG1P_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -4647,7 +4574,6 @@ const unsigned char glsl_unaryImage_LOG1P_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x44, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x44, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x49, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -4733,7 +4659,7 @@ const unsigned char glsl_unaryImage_LOG1P_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_LOG1P_comp_len = 1432; +unsigned int glsl_unaryImage_LOG1P_comp_len = 1420; const unsigned char glsl_unaryImage_ROUND_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -4771,7 +4697,6 @@ const unsigned char glsl_unaryImage_ROUND_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -4851,7 +4776,7 @@ const unsigned char glsl_unaryImage_ROUND_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_ROUND_comp_len = 1368; +unsigned int glsl_unaryImage_ROUND_comp_len = 1356; const unsigned char glsl_unaryImage_HARDSWISH_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -4889,7 +4814,6 @@ const unsigned char glsl_unaryImage_HARDSWISH_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x60, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x60, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x60, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x60, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x65, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -5005,7 +4929,7 @@ const unsigned char glsl_unaryImage_HARDSWISH_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_HARDSWISH_comp_len = 1800; +unsigned int glsl_unaryImage_HARDSWISH_comp_len = 1788; const unsigned char glsl_unaryImage_GELU_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -5043,7 +4967,6 @@ const unsigned char glsl_unaryImage_GELU_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x58, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x58, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -5151,7 +5074,7 @@ const unsigned char glsl_unaryImage_GELU_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unaryImage_GELU_comp_len = 1696; +unsigned int glsl_unaryImage_GELU_comp_len = 1684; const unsigned char glsl_im2col_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -5204,19 +5127,13 @@ const unsigned char glsl_im2col_comp[] = { 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x0a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0a, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0a, 0x01, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x0b, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x1a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x1d, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1d, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1d, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x1d, 0x01, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x1e, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x27, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x01, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -5527,7 +5444,7 @@ const unsigned char glsl_im2col_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x66, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x66, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_im2col_comp_len = 4464; +unsigned int glsl_im2col_comp_len = 4392; const unsigned char glsl_convolutionDepthwise_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -5582,31 +5499,19 @@ const unsigned char glsl_convolutionDepthwise_comp[] = { 0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xc7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc7, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc7, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc7, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc7, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc8, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -5854,7 +5759,7 @@ const unsigned char glsl_convolutionDepthwise_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_convolutionDepthwise_comp_len = 3868; +unsigned int glsl_convolutionDepthwise_comp_len = 3724; const unsigned char glsl_convolutionDepthwise_RELU_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -5909,31 +5814,19 @@ const unsigned char glsl_convolutionDepthwise_RELU_comp[] = { 0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xcb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xcb, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xcb, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcb, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcb, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcc, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xde, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xde, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -6187,7 +6080,7 @@ const unsigned char glsl_convolutionDepthwise_RELU_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_convolutionDepthwise_RELU_comp_len = 3940; +unsigned int glsl_convolutionDepthwise_RELU_comp_len = 3796; const unsigned char glsl_convolutionDepthwise_RELU6_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -6242,31 +6135,19 @@ const unsigned char glsl_convolutionDepthwise_RELU6_comp[] = { 0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xcd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xcd, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xcd, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcd, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcd, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xce, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe0, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe0, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -6524,7 +6405,7 @@ const unsigned char glsl_convolutionDepthwise_RELU6_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_convolutionDepthwise_RELU6_comp_len = 3988; +unsigned int glsl_convolutionDepthwise_RELU6_comp_len = 3844; const unsigned char glsl_gridSampleBilinear_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -6558,8 +6439,7 @@ const unsigned char glsl_gridSampleBilinear_comp[] = { 0x70, 0x6c, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x83, 0x00, 0x00, 0x00, 0x75, 0x47, 0x72, 0x69, 0x64, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x46, 0x01, 0x00, 0x00, - 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x22, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x22, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3b, 0x00, 0x00, 0x00, @@ -6574,28 +6454,14 @@ const unsigned char glsl_gridSampleBilinear_comp[] = { 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x42, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x42, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x83, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x83, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xa2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x46, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x46, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x46, 0x01, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x46, 0x01, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x47, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4c, 0x01, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x58, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x65, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x7f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x99, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xb3, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4c, 0x01, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, @@ -6849,7 +6715,7 @@ const unsigned char glsl_gridSampleBilinear_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x61, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x61, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_gridSampleBilinear_comp_len = 3852; +unsigned int glsl_gridSampleBilinear_comp_len = 3672; const unsigned char glsl_gridSampleBilinear_PAD_MODE_ZEROS_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -6883,8 +6749,7 @@ const unsigned char glsl_gridSampleBilinear_PAD_MODE_ZEROS_comp[] = { 0x70, 0x6c, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x75, 0x47, 0x72, 0x69, 0x64, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x4f, 0x01, 0x00, 0x00, - 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x2c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x45, 0x00, 0x00, 0x00, @@ -6899,31 +6764,14 @@ const unsigned char glsl_gridSampleBilinear_PAD_MODE_ZEROS_comp[] = { 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xac, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x4f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4f, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4f, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x4f, 0x01, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x4f, 0x01, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x50, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x55, 0x01, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x68, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x75, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x8a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x97, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xac, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xb9, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xce, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xdb, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x55, 0x01, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, @@ -7248,7 +7096,7 @@ const unsigned char glsl_gridSampleBilinear_PAD_MODE_ZEROS_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x6b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x6b, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_gridSampleBilinear_PAD_MODE_ZEROS_comp_len = 4740; +unsigned int glsl_gridSampleBilinear_PAD_MODE_ZEROS_comp_len = 4524; const unsigned char glsl_gridSampleNearest_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -7294,28 +7142,16 @@ const unsigned char glsl_gridSampleNearest_comp[] = { 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x5d, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb5, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xb6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc8, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc8, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc9, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xce, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xce, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -7510,7 +7346,7 @@ const unsigned char glsl_gridSampleNearest_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_gridSampleNearest_comp_len = 3096; +unsigned int glsl_gridSampleNearest_comp_len = 2952; const unsigned char glsl_gridSampleNearest_PAD_MODE_ZEROS_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -7556,28 +7392,16 @@ const unsigned char glsl_gridSampleNearest_PAD_MODE_ZEROS_comp[] = { 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x5d, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc4, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xd4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xd7, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xd7, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xd8, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xdd, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -7798,7 +7622,7 @@ const unsigned char glsl_gridSampleNearest_PAD_MODE_ZEROS_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_gridSampleNearest_PAD_MODE_ZEROS_comp_len = 3408; +unsigned int glsl_gridSampleNearest_PAD_MODE_ZEROS_comp_len = 3264; const unsigned char glsl_relu6_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -7833,19 +7657,13 @@ const unsigned char glsl_relu6_comp[] = { 0x47, 0x00, 0x03, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x34, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x3a, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x3d, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3d, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -7945,7 +7763,7 @@ const unsigned char glsl_relu6_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_relu6_comp_len = 1716; +unsigned int glsl_relu6_comp_len = 1644; const unsigned char glsl_binaryImage_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -7991,7 +7809,6 @@ const unsigned char glsl_binaryImage_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x70, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x70, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x75, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -8112,7 +7929,7 @@ const unsigned char glsl_binaryImage_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_binaryImage_comp_len = 1948; +unsigned int glsl_binaryImage_comp_len = 1936; const unsigned char glsl_binaryImage_ADD_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -8162,7 +7979,6 @@ const unsigned char glsl_binaryImage_ADD_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x73, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x73, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -8310,7 +8126,7 @@ const unsigned char glsl_binaryImage_ADD_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_binaryImage_ADD_comp_len = 2320; +unsigned int glsl_binaryImage_ADD_comp_len = 2308; const unsigned char glsl_binaryImage_SUB_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -8360,7 +8176,6 @@ const unsigned char glsl_binaryImage_SUB_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x73, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x73, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -8508,7 +8323,7 @@ const unsigned char glsl_binaryImage_SUB_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_binaryImage_SUB_comp_len = 2320; +unsigned int glsl_binaryImage_SUB_comp_len = 2308; const unsigned char glsl_binaryImage_MUL_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -8558,7 +8373,6 @@ const unsigned char glsl_binaryImage_MUL_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x73, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x73, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -8706,7 +8520,7 @@ const unsigned char glsl_binaryImage_MUL_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_binaryImage_MUL_comp_len = 2320; +unsigned int glsl_binaryImage_MUL_comp_len = 2308; const unsigned char glsl_binaryImage_DIV_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -8756,7 +8570,6 @@ const unsigned char glsl_binaryImage_DIV_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x7a, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x7a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x7a, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x7a, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -8915,7 +8728,7 @@ const unsigned char glsl_binaryImage_DIV_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_binaryImage_DIV_comp_len = 2460; +unsigned int glsl_binaryImage_DIV_comp_len = 2448; const unsigned char glsl_binaryImage_POW_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -8965,7 +8778,6 @@ const unsigned char glsl_binaryImage_POW_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x73, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x73, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -9113,7 +8925,7 @@ const unsigned char glsl_binaryImage_POW_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_binaryImage_POW_comp_len = 2328; +unsigned int glsl_binaryImage_POW_comp_len = 2316; const unsigned char glsl_binaryImage_VMAX_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -9163,7 +8975,6 @@ const unsigned char glsl_binaryImage_VMAX_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x73, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x73, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -9311,7 +9122,7 @@ const unsigned char glsl_binaryImage_VMAX_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_binaryImage_VMAX_comp_len = 2328; +unsigned int glsl_binaryImage_VMAX_comp_len = 2316; const unsigned char glsl_binaryImage_SQUDIFF_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -9361,7 +9172,6 @@ const unsigned char glsl_binaryImage_SQUDIFF_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x77, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -9510,7 +9320,7 @@ const unsigned char glsl_binaryImage_SQUDIFF_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_binaryImage_SQUDIFF_comp_len = 2340; +unsigned int glsl_binaryImage_SQUDIFF_comp_len = 2328; const unsigned char glsl_binaryImage_VMIN_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -9560,7 +9370,6 @@ const unsigned char glsl_binaryImage_VMIN_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x73, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x73, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -9708,7 +9517,7 @@ const unsigned char glsl_binaryImage_VMIN_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x25, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_binaryImage_VMIN_comp_len = 2328; +unsigned int glsl_binaryImage_VMIN_comp_len = 2316; const unsigned char glsl_matmul_input_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -9749,17 +9558,11 @@ const unsigned char glsl_matmul_input_comp[] = { 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x8a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8a, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8a, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8b, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x95, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xa7, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb1, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -9941,7 +9744,7 @@ const unsigned char glsl_matmul_input_comp[] = { 0x22, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_matmul_input_comp_len = 2744; +unsigned int glsl_matmul_input_comp_len = 2672; const unsigned char glsl_matmul_input_TRANSPOSE_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -9982,17 +9785,11 @@ const unsigned char glsl_matmul_input_TRANSPOSE_comp[] = { 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xab, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xab, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xab, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xab, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xab, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xac, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xb6, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xbf, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc8, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xd2, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xd2, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -10210,7 +10007,7 @@ const unsigned char glsl_matmul_input_TRANSPOSE_comp[] = { 0x22, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_matmul_input_TRANSPOSE_comp_len = 3176; +unsigned int glsl_matmul_input_TRANSPOSE_comp_len = 3104; const unsigned char glsl_nchwToimage_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -10271,13 +10068,10 @@ const unsigned char glsl_nchwToimage_comp[] = { 0x70, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xae, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xae, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xae, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -10478,7 +10272,7 @@ const unsigned char glsl_nchwToimage_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_nchwToimage_comp_len = 3168; +unsigned int glsl_nchwToimage_comp_len = 3132; const unsigned char glsl_packAsImage4x4_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -10529,7 +10323,6 @@ const unsigned char glsl_packAsImage4x4_comp[] = { 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x91, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x91, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x91, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x91, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe5, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -10740,7 +10533,7 @@ const unsigned char glsl_packAsImage4x4_comp[] = { 0x37, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_packAsImage4x4_comp_len = 3092; +unsigned int glsl_packAsImage4x4_comp_len = 3080; const unsigned char glsl_packAsImage4x4_TRANSPOSE_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -10791,7 +10584,6 @@ const unsigned char glsl_packAsImage4x4_TRANSPOSE_comp[] = { 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x91, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x91, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x91, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x91, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x06, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -11038,7 +10830,7 @@ const unsigned char glsl_packAsImage4x4_TRANSPOSE_comp[] = { 0x37, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_packAsImage4x4_TRANSPOSE_comp_len = 3524; +unsigned int glsl_packAsImage4x4_TRANSPOSE_comp_len = 3512; const unsigned char glsl_roipooling_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -11089,7 +10881,6 @@ const unsigned char glsl_roipooling_comp[] = { 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0b, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0b, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x0b, 0x01, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x0b, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1e, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -11387,7 +11178,7 @@ const unsigned char glsl_roipooling_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x2d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x2d, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_roipooling_comp_len = 4140; +unsigned int glsl_roipooling_comp_len = 4128; const unsigned char glsl_blit_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -11629,7 +11420,6 @@ const unsigned char glsl_blit_image_comp[] = { 0xc9, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc9, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc9, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc9, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, @@ -11832,7 +11622,7 @@ const unsigned char glsl_blit_image_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x21, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_blit_image_comp_len = 3004; +unsigned int glsl_blit_image_comp_len = 2992; const unsigned char glsl_fill_image_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -11870,7 +11660,6 @@ const unsigned char glsl_fill_image_comp[] = { 0x32, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x32, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x32, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, @@ -11942,7 +11731,7 @@ const unsigned char glsl_fill_image_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_fill_image_comp_len = 1272; +unsigned int glsl_fill_image_comp_len = 1260; const unsigned char glsl_imageTonchw_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -11987,12 +11776,9 @@ const unsigned char glsl_imageTonchw_comp[] = { 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, @@ -12198,7 +11984,7 @@ const unsigned char glsl_imageTonchw_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_imageTonchw_comp_len = 3024; +unsigned int glsl_imageTonchw_comp_len = 2988; const unsigned char glsl_softmaxHeight_NHWC_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -12460,19 +12246,13 @@ const unsigned char glsl_resizeNearest_comp[] = { 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x86, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x86, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x86, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x86, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x87, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -12629,7 +12409,7 @@ const unsigned char glsl_resizeNearest_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_resizeNearest_comp_len = 2460; +unsigned int glsl_resizeNearest_comp_len = 2388; const unsigned char glsl_resizeNearest_NEAREST_ROUND_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -12669,19 +12449,13 @@ const unsigned char glsl_resizeNearest_NEAREST_ROUND_comp[] = { 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8b, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8b, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8c, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -12843,7 +12617,7 @@ const unsigned char glsl_resizeNearest_NEAREST_ROUND_comp[] = { 0x33, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_resizeNearest_NEAREST_ROUND_comp_len = 2516; +unsigned int glsl_resizeNearest_NEAREST_ROUND_comp_len = 2444; const unsigned char glsl_reduce_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -13889,22 +13663,13 @@ const unsigned char glsl_resizeBilinear_comp[] = { 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8c, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x9d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xd0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xe7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe7, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe7, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xe7, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xe7, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xe8, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x01, 0x01, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x01, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -14120,7 +13885,7 @@ const unsigned char glsl_resizeBilinear_comp[] = { 0x33, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x33, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_resizeBilinear_comp_len = 3200; +unsigned int glsl_resizeBilinear_comp_len = 3092; const unsigned char glsl_nchwTonc4hw4_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -14879,18 +14644,13 @@ const unsigned char glsl_im2col1x1_comp[] = { 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xae, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xae, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xc3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc3, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc3, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc3, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc4, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xcd, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xcd, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -15090,7 +14850,7 @@ const unsigned char glsl_im2col1x1_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x66, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_im2col1x1_comp_len = 3112; +unsigned int glsl_im2col1x1_comp_len = 3052; const unsigned char glsl_avgpool_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -15142,7 +14902,6 @@ const unsigned char glsl_avgpool_comp[] = { 0x47, 0x00, 0x04, 0x00, 0xab, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xab, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xab, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xab, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -15360,7 +15119,7 @@ const unsigned char glsl_avgpool_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x47, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_avgpool_comp_len = 3184; +unsigned int glsl_avgpool_comp_len = 3172; const unsigned char glsl_unPackImage4x4_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -15399,15 +15158,9 @@ const unsigned char glsl_unPackImage4x4_comp[] = { 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x5f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x6b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x8c, 0x00, 0x00, 0x00, @@ -15586,7 +15339,7 @@ const unsigned char glsl_unPackImage4x4_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unPackImage4x4_comp_len = 2664; +unsigned int glsl_unPackImage4x4_comp_len = 2592; const unsigned char glsl_unPackImage4x4_TRANSPOSE_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -15625,15 +15378,9 @@ const unsigned char glsl_unPackImage4x4_TRANSPOSE_comp[] = { 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x5f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x6b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xac, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0xad, 0x00, 0x00, 0x00, @@ -15848,7 +15595,7 @@ const unsigned char glsl_unPackImage4x4_TRANSPOSE_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_unPackImage4x4_TRANSPOSE_comp_len = 3096; +unsigned int glsl_unPackImage4x4_TRANSPOSE_comp_len = 3024; const unsigned char glsl_maxpool_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -15900,7 +15647,6 @@ const unsigned char glsl_maxpool_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x9c, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xaf, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -16102,7 +15848,7 @@ const unsigned char glsl_maxpool_comp[] = { 0x47, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x47, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_maxpool_comp_len = 2996; +unsigned int glsl_maxpool_comp_len = 2984; const unsigned char glsl_winogradTransformDest2_3_1_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -16175,7 +15921,6 @@ const unsigned char glsl_winogradTransformDest2_3_1_comp[] = { 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x73, 0x01, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x73, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb1, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -16588,7 +16333,7 @@ const unsigned char glsl_winogradTransformDest2_3_1_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_winogradTransformDest2_3_1_comp_len = 5784; +unsigned int glsl_winogradTransformDest2_3_1_comp_len = 5772; const unsigned char glsl_winogradTransformDest2_3_1_RELU_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -16661,7 +16406,6 @@ const unsigned char glsl_winogradTransformDest2_3_1_RELU_comp[] = { 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x77, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x77, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x77, 0x01, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x77, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xbb, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -17087,7 +16831,7 @@ const unsigned char glsl_winogradTransformDest2_3_1_RELU_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_winogradTransformDest2_3_1_RELU_comp_len = 5940; +unsigned int glsl_winogradTransformDest2_3_1_RELU_comp_len = 5928; const unsigned char glsl_winogradTransformDest2_3_1_RELU6_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -17160,7 +16904,6 @@ const unsigned char glsl_winogradTransformDest2_3_1_RELU6_comp[] = { 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x79, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x79, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x79, 0x01, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x79, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xbd, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -17591,7 +17334,7 @@ const unsigned char glsl_winogradTransformDest2_3_1_RELU6_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_winogradTransformDest2_3_1_RELU6_comp_len = 6000; +unsigned int glsl_winogradTransformDest2_3_1_RELU6_comp_len = 5988; const unsigned char glsl_winogradTransformSource2_3_1_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -17660,7 +17403,6 @@ const unsigned char glsl_winogradTransformSource2_3_1_comp[] = { 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x95, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x95, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x95, 0x01, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x95, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x54, 0x02, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -18327,7 +18069,7 @@ const unsigned char glsl_winogradTransformSource2_3_1_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_winogradTransformSource2_3_1_comp_len = 8776; +unsigned int glsl_winogradTransformSource2_3_1_comp_len = 8764; const unsigned char glsl_col2Im_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -19104,13 +18846,10 @@ const unsigned char glsl_nc4hw4toimage_comp[] = { 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3b, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x3b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x6f, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x6f, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x6f, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x74, 0x00, 0x00, 0x00, @@ -19242,7 +18981,7 @@ const unsigned char glsl_nc4hw4toimage_comp[] = { 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_nc4hw4toimage_comp_len = 2240; +unsigned int glsl_nc4hw4toimage_comp_len = 2204; const unsigned char glsl_imageTonc4hw4_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -19287,12 +19026,9 @@ const unsigned char glsl_imageTonc4hw4_comp[] = { 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x45, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, @@ -19434,7 +19170,7 @@ const unsigned char glsl_imageTonc4hw4_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_imageTonc4hw4_comp_len = 2256; +unsigned int glsl_imageTonc4hw4_comp_len = 2220; const unsigned char glsl_matmul_output_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -19475,17 +19211,11 @@ const unsigned char glsl_matmul_output_comp[] = { 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x81, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x81, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x81, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x81, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x82, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x90, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9d, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xaa, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -19658,7 +19388,7 @@ const unsigned char glsl_matmul_output_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_matmul_output_comp_len = 2632; +unsigned int glsl_matmul_output_comp_len = 2560; const unsigned char glsl_matmul_output_BIAS_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -19703,17 +19433,11 @@ const unsigned char glsl_matmul_output_BIAS_comp[] = { 0x36, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x95, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x95, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x95, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x95, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x96, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xa4, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xb1, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xbe, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc7, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xc7, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -19900,7 +19624,7 @@ const unsigned char glsl_matmul_output_BIAS_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_matmul_output_BIAS_comp_len = 2856; +unsigned int glsl_matmul_output_BIAS_comp_len = 2784; const unsigned char glsl_matmul_output_TRANSPOSE_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -19941,17 +19665,11 @@ const unsigned char glsl_matmul_output_TRANSPOSE_comp[] = { 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xa3, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xa3, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xa3, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xa3, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xa4, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xb1, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xbe, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xcb, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xd1, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xd1, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -20160,7 +19878,7 @@ const unsigned char glsl_matmul_output_TRANSPOSE_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_matmul_output_TRANSPOSE_comp_len = 3064; +unsigned int glsl_matmul_output_TRANSPOSE_comp_len = 2992; const unsigned char glsl_matmul_output_TRANSPOSE_BIAS_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -20205,17 +19923,11 @@ const unsigned char glsl_matmul_output_TRANSPOSE_BIAS_comp[] = { 0x36, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xb7, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xb7, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xb8, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xc5, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xd2, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xdf, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe8, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe8, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -20438,7 +20150,7 @@ const unsigned char glsl_matmul_output_TRANSPOSE_BIAS_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x22, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_matmul_output_TRANSPOSE_BIAS_comp_len = 3288; +unsigned int glsl_matmul_output_TRANSPOSE_BIAS_comp_len = 3216; const unsigned char glsl_gemm16x16_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -20471,34 +20183,16 @@ const unsigned char glsl_gemm16x16_comp[] = { 0x47, 0x00, 0x03, 0x00, 0x16, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x60, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6f, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x7e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8d, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x94, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x94, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x94, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x31, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x31, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x31, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x31, 0x01, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x31, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x32, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x3a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x42, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x4a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x31, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x53, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, @@ -20777,7 +20471,7 @@ const unsigned char glsl_gemm16x16_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x28, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_gemm16x16_comp_len = 4012; +unsigned int glsl_gemm16x16_comp_len = 3796; const unsigned char glsl_gemm16x16_FP16_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -20814,34 +20508,16 @@ const unsigned char glsl_gemm16x16_FP16_comp[] = { 0x16, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x58, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x86, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x86, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x87, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xa2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xac, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x13, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x13, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x13, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x13, 0x01, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x13, 0x01, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x14, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x1d, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x26, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x2f, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x39, 0x01, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x39, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -21138,7 +20814,7 @@ const unsigned char glsl_gemm16x16_FP16_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x28, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_gemm16x16_FP16_comp_len = 4276; +unsigned int glsl_gemm16x16_FP16_comp_len = 4060; const unsigned char glsl_deconvolutionDepthwise_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -21193,30 +20869,19 @@ const unsigned char glsl_deconvolutionDepthwise_comp[] = { 0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x53, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x53, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x58, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xad, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xbf, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xca, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xd6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xd6, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xd6, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xd6, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xd7, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe9, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xe9, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -21494,7 +21159,7 @@ const unsigned char glsl_deconvolutionDepthwise_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x32, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_deconvolutionDepthwise_comp_len = 4216; +unsigned int glsl_deconvolutionDepthwise_comp_len = 4084; const unsigned char glsl_deconvolutionDepthwise_RELU_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -21549,30 +21214,19 @@ const unsigned char glsl_deconvolutionDepthwise_RELU_comp[] = { 0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x53, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x53, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x58, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xad, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xbf, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xca, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xda, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xda, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xdb, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xed, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xed, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -21856,7 +21510,7 @@ const unsigned char glsl_deconvolutionDepthwise_RELU_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x32, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_deconvolutionDepthwise_RELU_comp_len = 4288; +unsigned int glsl_deconvolutionDepthwise_RELU_comp_len = 4156; const unsigned char glsl_deconvolutionDepthwise_RELU6_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -21911,30 +21565,19 @@ const unsigned char glsl_deconvolutionDepthwise_RELU6_comp[] = { 0x47, 0x00, 0x03, 0x00, 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x53, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x53, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x53, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x58, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xad, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xad, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xbf, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xca, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xdc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xdc, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xdc, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xdc, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xef, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xef, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -22222,7 +21865,7 @@ const unsigned char glsl_deconvolutionDepthwise_RELU6_comp[] = { 0xf8, 0x00, 0x02, 0x00, 0x32, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_deconvolutionDepthwise_RELU6_comp_len = 4336; +unsigned int glsl_deconvolutionDepthwise_RELU6_comp_len = 4204; const unsigned char glsl_preluWithChannel_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -22259,24 +21902,15 @@ const unsigned char glsl_preluWithChannel_comp[] = { 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x5b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x5c, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x5f, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x7a, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, @@ -22400,7 +22034,7 @@ const unsigned char glsl_preluWithChannel_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x3a, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x3a, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_preluWithChannel_comp_len = 2088; +unsigned int glsl_preluWithChannel_comp_len = 1980; const unsigned char glsl_deconvIm2Col_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -22454,24 +22088,16 @@ const unsigned char glsl_deconvIm2Col_comp[] = { 0x17, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x70, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xee, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xee, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xee, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x00, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x00, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x00, 0x01, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x01, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x13, 0x01, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x13, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -22760,7 +22386,7 @@ const unsigned char glsl_deconvIm2Col_comp[] = { 0x5f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x5f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_deconvIm2Col_comp_len = 4268; +unsigned int glsl_deconvIm2Col_comp_len = 4172; const unsigned char glsl_deconvIm2Col_RELU_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -22814,24 +22440,16 @@ const unsigned char glsl_deconvIm2Col_RELU_comp[] = { 0x17, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x70, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xee, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xee, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xee, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x04, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x04, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x04, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x04, 0x01, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x05, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x17, 0x01, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x17, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -23126,7 +22744,7 @@ const unsigned char glsl_deconvIm2Col_RELU_comp[] = { 0x5f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x5f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_deconvIm2Col_RELU_comp_len = 4340; +unsigned int glsl_deconvIm2Col_RELU_comp_len = 4244; const unsigned char glsl_deconvIm2Col_RELU6_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -23180,24 +22798,16 @@ const unsigned char glsl_deconvIm2Col_RELU6_comp[] = { 0x17, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x70, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x71, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0xee, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xee, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xee, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x06, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x06, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x06, 0x01, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x06, 0x01, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x07, 0x01, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x01, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x19, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, @@ -23496,7 +23106,7 @@ const unsigned char glsl_deconvIm2Col_RELU6_comp[] = { 0x5f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x5f, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_deconvIm2Col_RELU6_comp_len = 4388; +unsigned int glsl_deconvIm2Col_RELU6_comp_len = 4292; const unsigned char glsl_buffer2Image1D_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, @@ -23667,7 +23277,6 @@ const unsigned char glsl_scale_comp[] = { 0x47, 0x00, 0x04, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, - 0x6c, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x71, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, @@ -23791,7 +23400,365 @@ const unsigned char glsl_scale_comp[] = { 0xf9, 0x00, 0x02, 0x00, 0x24, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x24, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_scale_comp_len = 2196; +unsigned int glsl_scale_comp_len = 2184; + +const unsigned char glsl_argmax_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x0d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x0d, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x75, + 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00, + 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x6e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, + 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x1a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x6e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x16, 0x00, 0x03, 0x00, 0x17, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x06, 0x00, 0x18, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x1a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x48, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x49, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0a, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x2c, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, + 0x75, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x1b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, + 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0xf7, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, + 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, + 0x1f, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x37, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x1b, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x5a, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x61, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x62, 0x00, 0x00, 0x00, 0xba, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x66, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x68, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x8a, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x55, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x55, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x1b, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_argmax_comp_len = 2096; + +const unsigned char glsl_argmax_ARGMIN_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x0d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x0d, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x75, + 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00, + 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x6e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, + 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x1a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x6e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x16, 0x00, 0x03, 0x00, 0x17, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x06, 0x00, 0x18, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x1a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x48, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x49, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0a, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x2c, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, + 0x75, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x1b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, + 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0xf7, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, + 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, + 0x1f, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x37, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x1b, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x5a, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x53, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x61, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x62, 0x00, 0x00, 0x00, 0xb8, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x66, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x68, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x8a, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x55, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x55, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x1b, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_argmax_ARGMIN_comp_len = 2096; const unsigned char glsl_buffer2Image3D_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, diff --git a/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp b/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp index 26e804ec4..5575d39eb 100644 --- a/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp +++ b/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp @@ -109,6 +109,8 @@ mMaps.insert(std::make_pair("glsl_deconvIm2Col_RELU_comp", std::make_pair(glsl_d mMaps.insert(std::make_pair("glsl_deconvIm2Col_RELU6_comp", std::make_pair(glsl_deconvIm2Col_RELU6_comp,glsl_deconvIm2Col_RELU6_comp_len))); mMaps.insert(std::make_pair("glsl_buffer2Image1D_comp", std::make_pair(glsl_buffer2Image1D_comp,glsl_buffer2Image1D_comp_len))); mMaps.insert(std::make_pair("glsl_scale_comp", std::make_pair(glsl_scale_comp,glsl_scale_comp_len))); +mMaps.insert(std::make_pair("glsl_argmax_comp", std::make_pair(glsl_argmax_comp,glsl_argmax_comp_len))); +mMaps.insert(std::make_pair("glsl_argmax_ARGMIN_comp", std::make_pair(glsl_argmax_ARGMIN_comp,glsl_argmax_ARGMIN_comp_len))); mMaps.insert(std::make_pair("glsl_buffer2Image3D_comp", std::make_pair(glsl_buffer2Image3D_comp,glsl_buffer2Image3D_comp_len))); } } diff --git a/source/backend/vulkan/image/compiler/makeshader.py b/source/backend/vulkan/image/compiler/makeshader.py index 42a904a86..f94765441 100755 --- a/source/backend/vulkan/image/compiler/makeshader.py +++ b/source/backend/vulkan/image/compiler/makeshader.py @@ -405,7 +405,8 @@ def genCppFile(objs, inc, dst): if len(spirv_save) > 0: out = spirv_save rm = False - print(os.popen("glslangValidator -V " + s + " -Os -o " + out).read()) + cmd = "glslangValidator -V " + s + " -Os -o " + out + print(os.popen(cmd).read()) else: out = spirv_cache rm = False diff --git a/source/backend/vulkan/image/execution/VulkanArgMax.cpp b/source/backend/vulkan/image/execution/VulkanArgMax.cpp new file mode 100644 index 000000000..cc97cdb74 --- /dev/null +++ b/source/backend/vulkan/image/execution/VulkanArgMax.cpp @@ -0,0 +1,129 @@ +// +// VulkanArgMax.cpp +// MNN +// +// Created by MNN on 2024/08/20. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "VulkanArgMax.hpp" + +namespace MNN { + +struct GpuArgMaxParam { + ivec4 size; // inside, mid, outside, 0 +}; + +VulkanArgMax::VulkanArgMax(const Op* op, Backend* bn) : VulkanBasicExecution(bn) { + auto vkBn = (VulkanBackend *)backend(); + + mAxis = op->main_as_ArgMax()->axis(); + + std::vector types{ + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }; + if (op->type() == OpType_ArgMax) { + mArgMaxPipeline = + vkBn->getPipeline("glsl_argmax_comp", types); + } else { + MNN_ASSERT(op->type() == OpType_ArgMin); + mArgMaxPipeline = + vkBn->getPipeline("glsl_argmax_ARGMIN_comp", types); + } + + mGpuArgMaxParam.reset(new VulkanBuffer(vkBn->getMemoryPool(), false, sizeof(GpuArgMaxParam), nullptr, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT)); + mDescriptorSet.reset(mArgMaxPipeline->createSet()); +} + +VulkanArgMax::~VulkanArgMax() { +} + +// set descriptorSet, including output, input and GPU param +ErrorCode VulkanArgMax::onEncode(const std::vector& inputs, const std::vector& outputs, + const VulkanCommandPool::Buffer* cmdBuffer) { + auto vkBn = (VulkanBackend*)backend(); + auto input = inputs[0]; + auto output = outputs[0]; + + // set GPU param + auto axis = mAxis; + if (axis < 0) { + axis = input->dimensions() + axis; + } + int inside = 1; + int outside = 1; + int mid = input->length(axis); + for (int i=0; ilength(i); + } + for (int i=axis+1; idimensions(); ++i) { + inside *= input->length(i); + } + auto total = outside * inside; + + auto Argmax = reinterpret_cast(mGpuArgMaxParam->map()); + Argmax->size[0] = inside; + Argmax->size[1] = mid; + Argmax->size[2] = outside; + Argmax->size[3] = 0; + mGpuArgMaxParam->unmap(); + + // set necessary storages, set descriptorSet and bind commandBuffer + { + int bufferSizeSource = sizeof(float); + for (int i=0; idimensions(); ++i) { + bufferSizeSource *= input->length(i); + } + mSource.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSizeSource, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT)); + mSource.convert.reset(new VulkanImageConverter(vkBn)); + } + { + int bufferSizeOutput = sizeof(float); + for (int i=0; idimensions(); ++i) { + bufferSizeOutput *= output->length(i); + } + mOutput.convert.reset(new VulkanImageConverter(vkBn)); + mOutput.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSizeOutput, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT)); + } + + mSource.convert->encodeTensorToBuffer(input, mSource.buffer->buffer(), mSource.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(input), cmdBuffer); + + mDescriptorSet->writeBuffer(mOutput.buffer->buffer(), 0, mOutput.buffer->size()); + mDescriptorSet->writeBuffer(mSource.buffer->buffer(), 1, mSource.buffer->size()); + mDescriptorSet->writeBuffer(mGpuArgMaxParam->buffer(), 2, mGpuArgMaxParam->size()); + + cmdBuffer->barrierSource(mSource.buffer->buffer(), 0, mSource.buffer->size()); + + mArgMaxPipeline->bind(cmdBuffer->get(), mDescriptorSet->get()); + vkCmdDispatch(cmdBuffer->get(), UP_DIV(total, 256), 1, 1); + + cmdBuffer->barrierSource(mOutput.buffer->buffer(), 0, mOutput.buffer->size()); + mOutput.convert->encodeBufferToTensor(mOutput.buffer->buffer(), output, mOutput.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(output), cmdBuffer); + { + mSource.buffer->release(); + mOutput.buffer->release(); + } + return NO_ERROR; +} + +class VulkanArgMaxCreator : public VulkanBackend::Creator { +public: + virtual VulkanBasicExecution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, + Backend* backend) const override { + if (TensorUtils::getDescribe(inputs[0])->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + // Don't support legency version + return nullptr; + } + return new VulkanArgMax(op, backend); + } +}; + +static bool gResistor = []() { + VulkanBackend::addCreator(OpType_ArgMax, new VulkanArgMaxCreator); + VulkanBackend::addCreator(OpType_ArgMin, new VulkanArgMaxCreator); + return true; +}(); + +} diff --git a/source/backend/vulkan/image/execution/VulkanArgMax.hpp b/source/backend/vulkan/image/execution/VulkanArgMax.hpp new file mode 100644 index 000000000..31d39c795 --- /dev/null +++ b/source/backend/vulkan/image/execution/VulkanArgMax.hpp @@ -0,0 +1,40 @@ +// +// VulkanArgMax.cpp +// MNN +// +// Created by MNN on 2024/08/20. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef VulkanArgMax_hpp +#define VulkanArgMax_hpp + +#include "VulkanBasicExecution.hpp" +#include "VulkanImageConverter.hpp" + +namespace MNN { +class VulkanArgMax : public VulkanBasicExecution { + +public: + VulkanArgMax(const Op* op, Backend* bn); + virtual ~VulkanArgMax(); + ErrorCode onEncode(const std::vector& inputs, const std::vector& outputs, + const VulkanCommandPool::Buffer* cmdBuffer) override; + +private: + const VulkanPipeline* mArgMaxPipeline; + std::shared_ptr mDescriptorSet; + std::shared_ptr mGpuArgMaxParam; + struct ConvertInfo { + const VulkanPipeline* pipeline; + std::shared_ptr convert; + std::shared_ptr buffer; + }; + ConvertInfo mSource; + ConvertInfo mOutput; + int mAxis; +}; + +} // namespace MNN + +#endif /* VulkanArgMax_hpp */ diff --git a/source/backend/vulkan/image/execution/VulkanBinary.cpp b/source/backend/vulkan/image/execution/VulkanBinary.cpp index ece019c90..2cc3e9037 100644 --- a/source/backend/vulkan/image/execution/VulkanBinary.cpp +++ b/source/backend/vulkan/image/execution/VulkanBinary.cpp @@ -173,6 +173,9 @@ class VulkanBinaryCreator : public VulkanBackend::Creator { virtual VulkanBasicExecution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { auto input0 = inputs[0]; + if (input0->getType().code != halide_type_float) { + return nullptr; + } auto image = TensorUtils::getDescribe(input0)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4; auto shader = _getShaderName(op, image); if (shader.empty()) { diff --git a/source/backend/vulkan/image/execution/VulkanConvolution.cpp b/source/backend/vulkan/image/execution/VulkanConvolution.cpp index 2730b504b..b0e3d7ab1 100644 --- a/source/backend/vulkan/image/execution/VulkanConvolution.cpp +++ b/source/backend/vulkan/image/execution/VulkanConvolution.cpp @@ -255,7 +255,7 @@ class VulkanConvolutionCreator : public VulkanBackend::Creator { return nullptr; } } - quanWeight = ConvolutionCommon::load(op->main_as_Convolution2D(), backend, true); + quanWeight = ConvolutionCommon::load(op, backend, true); srcCount = quanWeight->weightFloat.size() / (outputCount * fh * fw); source = quanWeight->weightFloat.get(); weightSize = quanWeight->weightFloat.size(); diff --git a/source/backend/vulkan/image/execution/VulkanDeconvolution.cpp b/source/backend/vulkan/image/execution/VulkanDeconvolution.cpp index 22b906356..ae9dab1d5 100644 --- a/source/backend/vulkan/image/execution/VulkanDeconvolution.cpp +++ b/source/backend/vulkan/image/execution/VulkanDeconvolution.cpp @@ -20,7 +20,8 @@ static void writeReorderBuffer(VulkanMatMul::Reorder::nchwBuffer& buffer, int co buffer.stride[3] = 1; } -VulkanDeconvolution::VulkanDeconvolution(Backend* bn, const std::vector& inputs, const Convolution2D* conv) : VulkanBasicExecution(bn) { +VulkanDeconvolution::VulkanDeconvolution(Backend* bn, const std::vector& inputs, const Op* op) : VulkanBasicExecution(bn) { + auto conv = op->main_as_Convolution2D(); mConvCommonOption = conv->common(); auto vkBn = (VulkanBackend*)bn; mConvParam = std::make_shared(vkBn->getMemoryPool(), false, @@ -34,7 +35,7 @@ VulkanDeconvolution::VulkanDeconvolution(Backend* bn, const std::vector const float* filterDataPtr = nullptr; int tempWeightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &filterDataPtr, &tempWeightSize); + ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &filterDataPtr, &tempWeightSize); if (nullptr != filterDataPtr) { MNN_ASSERT(inputs.size() == 1); @@ -142,7 +143,7 @@ ErrorCode VulkanDeconvolution::onEncode(const std::vector& inputs, cons dstImage->barrierWrite(cmdBuffer->get()); (reinterpret_cast(src->deviceId()))->image()->barrierRead(cmdBuffer->get()); - + vkCmdDispatch(cmdBuffer->get(), UP_DIV(totalInputSize, VulkanConvolutionCommon::gImage2ColLocal), 1, 1); } @@ -176,7 +177,7 @@ class VulkanDeconvolutionCreator : public VulkanBackend::Creator { public: virtual VulkanBasicExecution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { - return new VulkanDeconvolution(backend, inputs, op->main_as_Convolution2D()); + return new VulkanDeconvolution(backend, inputs, op); } }; diff --git a/source/backend/vulkan/image/execution/VulkanDeconvolution.hpp b/source/backend/vulkan/image/execution/VulkanDeconvolution.hpp index e133193b7..daeec8950 100644 --- a/source/backend/vulkan/image/execution/VulkanDeconvolution.hpp +++ b/source/backend/vulkan/image/execution/VulkanDeconvolution.hpp @@ -18,7 +18,7 @@ class VulkanDeconvolution : public VulkanBasicExecution { virtual ~VulkanDeconvolution() { } - VulkanDeconvolution(Backend* bn, const std::vector& inputs, const Convolution2D* conv); + VulkanDeconvolution(Backend* bn, const std::vector& inputs, const Op* op); virtual ErrorCode onEncode(const std::vector& inputs, const std::vector& outputs, const VulkanCommandPool::Buffer* cmdBuffer) override; diff --git a/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.cpp b/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.cpp index 5f9855cc5..0863aaa47 100644 --- a/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.cpp +++ b/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.cpp @@ -9,8 +9,9 @@ #include "VulkanDeconvolutionDepthwise.hpp" #include "core/Macro.h" namespace MNN { -VulkanDeconvolutionDepthwise::VulkanDeconvolutionDepthwise(Backend* bn, const Convolution2D* conv) +VulkanDeconvolutionDepthwise::VulkanDeconvolutionDepthwise(Backend* bn, const Op* op) : VulkanBasicExecution(bn) { + auto conv = op->main_as_Convolution2D(); mConvCommonOption = conv->common(); auto vkBn = (VulkanBackend*)bn; int outputC4 = UP_DIV(mConvCommonOption->outputCount(), 4); @@ -41,7 +42,7 @@ VulkanDeconvolutionDepthwise::VulkanDeconvolutionDepthwise(Backend* bn, const Co const float* tempWeight = nullptr; int tempWeightSize = 0; std::shared_ptr quanCommon; - ConvolutionCommon::getConvParameters(&quanCommon, bn, conv, &tempWeight, &tempWeightSize); + ConvolutionCommon::getConvParameters(&quanCommon, bn, op, &tempWeight, &tempWeightSize); for (int b = 0; b < co; ++b) { int b_4 = b / 4; @@ -112,7 +113,7 @@ class VulkanDeconvolutionDepthwiseCreator : public VulkanBackend::Creator { if (inputs.size() > 1) { return nullptr; } - return new VulkanDeconvolutionDepthwise(backend, op->main_as_Convolution2D()); + return new VulkanDeconvolutionDepthwise(backend, op); } }; diff --git a/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.hpp b/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.hpp index 690031968..412bd1957 100644 --- a/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.hpp +++ b/source/backend/vulkan/image/execution/VulkanDeconvolutionDepthwise.hpp @@ -17,7 +17,7 @@ class VulkanDeconvolutionDepthwise : public VulkanBasicExecution { virtual ~VulkanDeconvolutionDepthwise() { } - VulkanDeconvolutionDepthwise(Backend* bn, const Convolution2D* conv); + VulkanDeconvolutionDepthwise(Backend* bn, const Op* op); virtual ErrorCode onEncode(const std::vector& inputs, const std::vector& outputs, const VulkanCommandPool::Buffer* cmdBuffer) override; diff --git a/source/backend/vulkan/image/execution/VulkanRaster.cpp b/source/backend/vulkan/image/execution/VulkanRaster.cpp index d4d6e016b..d5cc81b07 100644 --- a/source/backend/vulkan/image/execution/VulkanRaster.cpp +++ b/source/backend/vulkan/image/execution/VulkanRaster.cpp @@ -236,6 +236,9 @@ ErrorCode VulkanRaster::onEncode(const std::vector &___inputs, const s class VulkanRasterCreator : public VulkanBackend::Creator { public: virtual VulkanBasicExecution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* bn) const override { + if (outputs[0]->getType().bytes() < 4) { + return nullptr; + } return new VulkanRaster(bn); } }; diff --git a/source/backend/vulkan/image/execution/VulkanUnary.cpp b/source/backend/vulkan/image/execution/VulkanUnary.cpp index 574ccfcc2..839c43d48 100644 --- a/source/backend/vulkan/image/execution/VulkanUnary.cpp +++ b/source/backend/vulkan/image/execution/VulkanUnary.cpp @@ -74,6 +74,8 @@ static std::string _getMidType(const Op* op) { SETTYPE(UnaryOpOperation_ROUND, "ROUND"); SETTYPE(UnaryOpOperation_HARDSWISH, "HARDSWISH"); SETTYPE(UnaryOpOperation_GELU, "GELU"); + // Since SPIR-V lacks a built-in erf (gauss error function) instruction and the existing shader implementation of GELU is essentially an approximation of erf, there is no need to add a new implementation of GELU_STANDARD. + SETTYPE(UnaryOpOperation_GELU_STANDARD, "GELU"); } while(false); #undef SETTYPE } diff --git a/source/backend/vulkan/image/execution/glsl/argmax.comp b/source/backend/vulkan/image/execution/glsl/argmax.comp new file mode 100644 index 000000000..4a52df6d8 --- /dev/null +++ b/source/backend/vulkan/image/execution/glsl/argmax.comp @@ -0,0 +1,51 @@ +#version 440 core +#define FLOAT float + +layout(std430) buffer; +layout(set=0, binding=0) writeonly buffer destBuffer{ + int data[]; +} uOutput; + +layout(set=0, binding=1) readonly buffer sourceBuffer0{ + FLOAT data[]; +} uInput; + +layout(set=0, binding=2) uniform constBuffer { + int w; //inside + int h; //axis + int c; //outside + float k; // 0 +}uConst; + +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +void main() +{ + ivec3 posTmp = ivec3(gl_GlobalInvocationID); + ivec2 pos; + pos.x = posTmp.x / uConst.w; + pos.y = posTmp.x % uConst.w; + // x: index in outside, y: index in inside + if(pos.y < uConst.w && pos.x < uConst.c) + { + int basicOffset = pos.x * uConst.w * uConst.h + pos.y; + FLOAT value = uInput.data[basicOffset]; + int index = 0; + for(int i = 1; i < uConst.h; ++i) + { + FLOAT valueCurr = uInput.data[basicOffset + i * uConst.w]; +#ifndef ARGMIN + if (valueCurr > value) { + value = valueCurr; + index = i; + } +#else + if (valueCurr < value) { + value = valueCurr; + index = i; + } +#endif + } + uOutput.data[posTmp.x] = index; + } +} diff --git a/source/backend/vulkan/image/execution/glsl/avgpool.comp b/source/backend/vulkan/image/execution/glsl/avgpool.comp index 5548222d2..a0b46a905 100644 --- a/source/backend/vulkan/image/execution/glsl/avgpool.comp +++ b/source/backend/vulkan/image/execution/glsl/avgpool.comp @@ -1,6 +1,6 @@ #version 440 core layout(std140) buffer; -layout(set=0, binding=0) writeonly restrict uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; diff --git a/source/backend/vulkan/image/execution/glsl/binaryImage.comp b/source/backend/vulkan/image/execution/glsl/binaryImage.comp index 5e79b2256..c7d25085b 100644 --- a/source/backend/vulkan/image/execution/glsl/binaryImage.comp +++ b/source/backend/vulkan/image/execution/glsl/binaryImage.comp @@ -1,5 +1,5 @@ #version 440 core -layout(set=0, binding=0) writeonly restrict uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler2D uInput0; layout(set=0, binding=2) uniform sampler2D uInput1; diff --git a/source/backend/vulkan/image/execution/glsl/blit_image.comp b/source/backend/vulkan/image/execution/glsl/blit_image.comp index d1d6eeca5..4c2d000b4 100644 --- a/source/backend/vulkan/image/execution/glsl/blit_image.comp +++ b/source/backend/vulkan/image/execution/glsl/blit_image.comp @@ -1,6 +1,6 @@ #version 440 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; layout(set=0, binding=2) uniform constBuffer{ diff --git a/source/backend/vulkan/image/execution/glsl/convolutionDepthwise.comp b/source/backend/vulkan/image/execution/glsl/convolutionDepthwise.comp index b15b3464f..b34b48c96 100644 --- a/source/backend/vulkan/image/execution/glsl/convolutionDepthwise.comp +++ b/source/backend/vulkan/image/execution/glsl/convolutionDepthwise.comp @@ -1,15 +1,13 @@ #version 440 core -layout(std140) buffer; +layout(set=0, binding=0) writeonly uniform image2D uOutput; +layout(set=0, binding=1) uniform sampler2D uInput; -layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; -layout(set=0, binding=1) uniform mediump sampler2D uInput; +layout(set=0, binding=2) uniform sampler2D uKernel; -layout(set=0, binding=2) uniform mediump sampler2D uKernel; +layout(set=0, binding=3) uniform sampler2D uBias; -layout(set=0, binding=3) uniform mediump sampler2D uBias; - -layout(set=0, binding=4) uniform constBuffer { +layout(set=0, binding=4) readonly uniform constBuffer { ivec2 pad; ivec2 kernelSize; ivec2 stride; diff --git a/source/backend/vulkan/image/execution/glsl/convolutionDepthwiseMali.comp b/source/backend/vulkan/image/execution/glsl/convolutionDepthwiseMali.comp index f9d81e461..39d068b40 100644 --- a/source/backend/vulkan/image/execution/glsl/convolutionDepthwiseMali.comp +++ b/source/backend/vulkan/image/execution/glsl/convolutionDepthwiseMali.comp @@ -1,15 +1,13 @@ #version 440 core -layout(std140) buffer; +layout(set=0, binding=0) writeonly uniform image2D uOutput; +layout(set=0, binding=1) uniform sampler2D uInput; -layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; -layout(set=0, binding=1) uniform mediump sampler2D uInput; +layout(set=0, binding=2) uniform sampler2D uKernel; -layout(set=0, binding=2) uniform mediump sampler2D uKernel; +layout(set=0, binding=3) uniform sampler2D uBias; -layout(set=0, binding=3) uniform mediump sampler2D uBias; - -layout(set=0, binding=4) uniform constBuffer { +layout(set=0, binding=4) readonly uniform constBuffer { ivec2 pad; ivec2 kernelSize; ivec2 stride; diff --git a/source/backend/vulkan/image/execution/glsl/deconvCol2Im.comp b/source/backend/vulkan/image/execution/glsl/deconvCol2Im.comp index 5a9283986..a394d537d 100644 --- a/source/backend/vulkan/image/execution/glsl/deconvCol2Im.comp +++ b/source/backend/vulkan/image/execution/glsl/deconvCol2Im.comp @@ -1,7 +1,7 @@ #version 440 core -layout(set=0, binding=0) uniform mediump sampler2D uInput; -layout(set=0, binding=1) writeonly uniform mediump image2D uOutput; +layout(set=0, binding=0) uniform sampler2D uInput; +layout(set=0, binding=1) writeonly uniform image2D uOutput; layout(set=0, binding=2) readonly uniform constBuffer { ivec2 pad; diff --git a/source/backend/vulkan/image/execution/glsl/deconvIm2Col.comp b/source/backend/vulkan/image/execution/glsl/deconvIm2Col.comp index a56c8fc2e..97dc19d28 100644 --- a/source/backend/vulkan/image/execution/glsl/deconvIm2Col.comp +++ b/source/backend/vulkan/image/execution/glsl/deconvIm2Col.comp @@ -1,9 +1,9 @@ #version 440 core layout(std430) buffer; -layout(set=0, binding=0) writeonly mediump uniform image2D uOutput; -layout(set=0, binding=1) uniform mediump sampler2D uInput; -layout(set=0, binding=2) uniform mediump sampler2D uBias; +layout(set=0, binding=0) writeonly uniform image2D uOutput; +layout(set=0, binding=1) uniform sampler2D uInput; +layout(set=0, binding=2) uniform sampler2D uBias; layout(set=0, binding=3) readonly uniform constBuffer { ivec2 pad; diff --git a/source/backend/vulkan/image/execution/glsl/deconvolutionDepthwise.comp b/source/backend/vulkan/image/execution/glsl/deconvolutionDepthwise.comp index c22eb2d49..32b5e9219 100644 --- a/source/backend/vulkan/image/execution/glsl/deconvolutionDepthwise.comp +++ b/source/backend/vulkan/image/execution/glsl/deconvolutionDepthwise.comp @@ -1,10 +1,10 @@ #version 450 core layout(std430) buffer; -layout(set=0, binding=0) writeonly mediump uniform image2D uOutput; -layout(set=0, binding=1) uniform mediump sampler2D uInput; -layout(set=0, binding=2) uniform mediump sampler2D uKernel; -layout(set=0, binding=3) uniform mediump sampler2D uBias; +layout(set=0, binding=0) writeonly uniform image2D uOutput; +layout(set=0, binding=1) uniform sampler2D uInput; +layout(set=0, binding=2) uniform sampler2D uKernel; +layout(set=0, binding=3) uniform sampler2D uBias; layout(set=0, binding=4) readonly uniform constBuffer { ivec2 pad; diff --git a/source/backend/vulkan/image/execution/glsl/fill_image.comp b/source/backend/vulkan/image/execution/glsl/fill_image.comp index abba311f6..9a4818051 100644 --- a/source/backend/vulkan/image/execution/glsl/fill_image.comp +++ b/source/backend/vulkan/image/execution/glsl/fill_image.comp @@ -1,6 +1,6 @@ #version 440 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform constBuffer{ vec4 value; ivec4 imageSize; diff --git a/source/backend/vulkan/image/execution/glsl/gemm16x16.comp b/source/backend/vulkan/image/execution/glsl/gemm16x16.comp index 636756974..b71b66207 100644 --- a/source/backend/vulkan/image/execution/glsl/gemm16x16.comp +++ b/source/backend/vulkan/image/execution/glsl/gemm16x16.comp @@ -9,11 +9,11 @@ #define MAT4 mat4 #endif layout(std140) buffer; -layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; -layout(set=0, binding=1) uniform mediump sampler2D uInput; -layout(set=0, binding=2) uniform mediump sampler2D uKernel; +layout(set=0, binding=0) writeonly uniform image2D uOutput; +layout(set=0, binding=1) uniform sampler2D uInput; +layout(set=0, binding=2) uniform sampler2D uKernel; -layout(set=0, binding=3) readonly restrict uniform constBuffer { +layout(set=0, binding=3) readonly uniform constBuffer { ivec4 outputSize; }uConst; diff --git a/source/backend/vulkan/image/execution/glsl/gridSampleBilinear.comp b/source/backend/vulkan/image/execution/glsl/gridSampleBilinear.comp index 801108ed0..358b42c1b 100644 --- a/source/backend/vulkan/image/execution/glsl/gridSampleBilinear.comp +++ b/source/backend/vulkan/image/execution/glsl/gridSampleBilinear.comp @@ -1,9 +1,9 @@ #version 450 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; -layout(set=0, binding=1) uniform mediump sampler2D uInput; -layout(set=0, binding=2) uniform mediump sampler2D uGrid; +layout(set=0, binding=0) writeonly uniform image2D uOutput; +layout(set=0, binding=1) uniform sampler2D uInput; +layout(set=0, binding=2) uniform sampler2D uGrid; layout(set=0, binding=3) uniform gridSampleBuffer{ ivec4 outImgSize; diff --git a/source/backend/vulkan/image/execution/glsl/gridSampleNearest.comp b/source/backend/vulkan/image/execution/glsl/gridSampleNearest.comp index fa8a9f041..20adc3a93 100644 --- a/source/backend/vulkan/image/execution/glsl/gridSampleNearest.comp +++ b/source/backend/vulkan/image/execution/glsl/gridSampleNearest.comp @@ -1,9 +1,9 @@ #version 450 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; -layout(set=0, binding=1) uniform mediump sampler2D uInput; -layout(set=0, binding=2) uniform mediump sampler2D uGrid; +layout(set=0, binding=0) writeonly uniform image2D uOutput; +layout(set=0, binding=1) uniform sampler2D uInput; +layout(set=0, binding=2) uniform sampler2D uGrid; layout(set=0, binding=3) uniform gridSampleBuffer{ ivec4 outImgSize; diff --git a/source/backend/vulkan/image/execution/glsl/im2col.comp b/source/backend/vulkan/image/execution/glsl/im2col.comp index f62ee3861..9485c931e 100644 --- a/source/backend/vulkan/image/execution/glsl/im2col.comp +++ b/source/backend/vulkan/image/execution/glsl/im2col.comp @@ -1,8 +1,8 @@ #version 440 core layout(std140) buffer; -layout(set=0, binding=0) writeonly mediump uniform image2D uOutput; -layout(set=0, binding=1) uniform mediump sampler2D uInput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; +layout(set=0, binding=1) uniform sampler2D uInput; layout(set=0, binding=2) readonly uniform constBuffer { ivec2 pad; diff --git a/source/backend/vulkan/image/execution/glsl/im2col1x1.comp b/source/backend/vulkan/image/execution/glsl/im2col1x1.comp index bbac4e8f4..ac4306af5 100644 --- a/source/backend/vulkan/image/execution/glsl/im2col1x1.comp +++ b/source/backend/vulkan/image/execution/glsl/im2col1x1.comp @@ -1,8 +1,8 @@ #version 440 core layout(std140) buffer; -layout(set=0, binding=0) writeonly mediump uniform image2D uOutput; -layout(set=0, binding=1) mediump uniform sampler2D uInput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; +layout(set=0, binding=1) uniform sampler2D uInput; layout(set=0, binding=2) readonly uniform constBuffer { ivec2 pad; diff --git a/source/backend/vulkan/image/execution/glsl/imageTonc4hw4.comp b/source/backend/vulkan/image/execution/glsl/imageTonc4hw4.comp index 66beb5094..afcb45b1c 100644 --- a/source/backend/vulkan/image/execution/glsl/imageTonc4hw4.comp +++ b/source/backend/vulkan/image/execution/glsl/imageTonc4hw4.comp @@ -1,6 +1,6 @@ #version 440 core layout(std430) buffer; -layout(set=0, binding=0) uniform mediump sampler2D uInput; +layout(set=0, binding=0) uniform sampler2D uInput; layout(set=0, binding=1) writeonly buffer destBuffer{ vec4 data[]; diff --git a/source/backend/vulkan/image/execution/glsl/imageTonchw.comp b/source/backend/vulkan/image/execution/glsl/imageTonchw.comp index 4b7d3d694..de2f4b7e3 100644 --- a/source/backend/vulkan/image/execution/glsl/imageTonchw.comp +++ b/source/backend/vulkan/image/execution/glsl/imageTonchw.comp @@ -1,6 +1,6 @@ #version 440 core layout(std430) buffer; -layout(set=0, binding=0) uniform mediump sampler2D uInput; +layout(set=0, binding=0) uniform sampler2D uInput; layout(set=0, binding=1) writeonly buffer destBuffer{ float data[]; diff --git a/source/backend/vulkan/image/execution/glsl/macro.json b/source/backend/vulkan/image/execution/glsl/macro.json index ff07a939e..3c964eff1 100644 --- a/source/backend/vulkan/image/execution/glsl/macro.json +++ b/source/backend/vulkan/image/execution/glsl/macro.json @@ -1,4 +1,7 @@ { + "argmax.comp":[ + "ARGMIN" + ], "matmul_output.comp":[ "BIAS", "TRANSPOSE", diff --git a/source/backend/vulkan/image/execution/glsl/matmul_input.comp b/source/backend/vulkan/image/execution/glsl/matmul_input.comp index 5f2d76109..d413e650a 100644 --- a/source/backend/vulkan/image/execution/glsl/matmul_input.comp +++ b/source/backend/vulkan/image/execution/glsl/matmul_input.comp @@ -1,6 +1,6 @@ #version 440 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; layout(set=0, binding=2) uniform offsetBuffer { ivec4 size;//w/4, h/4, hLimit, w/4*h/4 diff --git a/source/backend/vulkan/image/execution/glsl/matmul_output.comp b/source/backend/vulkan/image/execution/glsl/matmul_output.comp index d65483b74..6f994c114 100644 --- a/source/backend/vulkan/image/execution/glsl/matmul_output.comp +++ b/source/backend/vulkan/image/execution/glsl/matmul_output.comp @@ -1,6 +1,6 @@ #version 440 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; layout(set=0, binding=2) uniform offsetBuffer { ivec4 size;//w/4, h/4, w, w/4*h/4 diff --git a/source/backend/vulkan/image/execution/glsl/maxpool.comp b/source/backend/vulkan/image/execution/glsl/maxpool.comp index 222b9cb5e..c644709d0 100644 --- a/source/backend/vulkan/image/execution/glsl/maxpool.comp +++ b/source/backend/vulkan/image/execution/glsl/maxpool.comp @@ -1,6 +1,6 @@ #version 440 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; diff --git a/source/backend/vulkan/image/execution/glsl/nc4hw4toimage.comp b/source/backend/vulkan/image/execution/glsl/nc4hw4toimage.comp index 26a9f867a..f9bfb0b46 100644 --- a/source/backend/vulkan/image/execution/glsl/nc4hw4toimage.comp +++ b/source/backend/vulkan/image/execution/glsl/nc4hw4toimage.comp @@ -1,6 +1,6 @@ #version 440 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) readonly buffer destBuffer{ vec4 data[]; diff --git a/source/backend/vulkan/image/execution/glsl/nchwToimage.comp b/source/backend/vulkan/image/execution/glsl/nchwToimage.comp index 34db019bc..ac240e46a 100644 --- a/source/backend/vulkan/image/execution/glsl/nchwToimage.comp +++ b/source/backend/vulkan/image/execution/glsl/nchwToimage.comp @@ -1,6 +1,6 @@ #version 450 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) readonly buffer sourceBuffer{ float data[]; diff --git a/source/backend/vulkan/image/execution/glsl/packAsImage4x4.comp b/source/backend/vulkan/image/execution/glsl/packAsImage4x4.comp index 43ec6b1ff..6705b1cee 100644 --- a/source/backend/vulkan/image/execution/glsl/packAsImage4x4.comp +++ b/source/backend/vulkan/image/execution/glsl/packAsImage4x4.comp @@ -1,6 +1,6 @@ #version 440 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict highp uniform image2D uOutput; +layout(set=0, binding=0) writeonly highp uniform image2D uOutput; layout(set=0, binding=1) readonly buffer sourceBuffer{ vec4 data[]; diff --git a/source/backend/vulkan/image/execution/glsl/preluWithChannel.comp b/source/backend/vulkan/image/execution/glsl/preluWithChannel.comp index b9be73066..79556d6b1 100644 --- a/source/backend/vulkan/image/execution/glsl/preluWithChannel.comp +++ b/source/backend/vulkan/image/execution/glsl/preluWithChannel.comp @@ -1,9 +1,9 @@ #version 450 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; -layout(set=0, binding=1) uniform mediump sampler2D uInput; -layout(set=0, binding=2) uniform mediump sampler2D uSlope; +layout(set=0, binding=0) writeonly uniform image2D uOutput; +layout(set=0, binding=1) uniform sampler2D uInput; +layout(set=0, binding=2) uniform sampler2D uSlope; layout(set = 0, binding = 3) uniform reluBuffer{ ivec4 imgSize; diff --git a/source/backend/vulkan/image/execution/glsl/relu.comp b/source/backend/vulkan/image/execution/glsl/relu.comp index 118f716bd..0597d8aaa 100644 --- a/source/backend/vulkan/image/execution/glsl/relu.comp +++ b/source/backend/vulkan/image/execution/glsl/relu.comp @@ -1,7 +1,7 @@ #version 450 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; -layout(set=0, binding=1) uniform mediump sampler2D uInput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; +layout(set=0, binding=1) uniform sampler2D uInput; layout(set = 0, binding = 2) uniform reluBuffer{ ivec4 imgSize; diff --git a/source/backend/vulkan/image/execution/glsl/relu6.comp b/source/backend/vulkan/image/execution/glsl/relu6.comp index 84d4da59e..60f5f1fba 100644 --- a/source/backend/vulkan/image/execution/glsl/relu6.comp +++ b/source/backend/vulkan/image/execution/glsl/relu6.comp @@ -1,7 +1,7 @@ #version 450 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict mediump uniform image2D uOutput; -layout(set=0, binding=1) uniform mediump sampler2D uInput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; +layout(set=0, binding=1) uniform sampler2D uInput; layout(set = 0, binding = 2) uniform reluBuffer{ ivec4 imgSize; diff --git a/source/backend/vulkan/image/execution/glsl/resizeBilinear.comp b/source/backend/vulkan/image/execution/glsl/resizeBilinear.comp index 2740f3776..83902f7a9 100644 --- a/source/backend/vulkan/image/execution/glsl/resizeBilinear.comp +++ b/source/backend/vulkan/image/execution/glsl/resizeBilinear.comp @@ -1,8 +1,8 @@ #version 450 core layout(std430) buffer; -layout(set=0, binding=0) uniform mediump sampler2D uInput; -layout(set=0, binding=1) writeonly restrict mediump uniform image2D uOutput; +layout(set=0, binding=0) uniform sampler2D uInput; +layout(set=0, binding=1) writeonly uniform image2D uOutput; layout(set = 0, binding = 2) uniform reluBuffer{ ivec4 inImgSize; diff --git a/source/backend/vulkan/image/execution/glsl/resizeNearest.comp b/source/backend/vulkan/image/execution/glsl/resizeNearest.comp index ae12f258f..352dc6e28 100644 --- a/source/backend/vulkan/image/execution/glsl/resizeNearest.comp +++ b/source/backend/vulkan/image/execution/glsl/resizeNearest.comp @@ -1,8 +1,8 @@ #version 450 core layout(std430) buffer; -layout(set=0, binding=0) uniform mediump sampler2D uInput; -layout(set=0, binding=1) writeonly restrict mediump uniform image2D uOutput; +layout(set=0, binding=0) uniform sampler2D uInput; +layout(set=0, binding=1) writeonly uniform image2D uOutput; layout(set = 0, binding = 2) uniform reluBuffer{ ivec4 inImgSize; diff --git a/source/backend/vulkan/image/execution/glsl/roipooling.comp b/source/backend/vulkan/image/execution/glsl/roipooling.comp index 36db110aa..f0c7a582f 100644 --- a/source/backend/vulkan/image/execution/glsl/roipooling.comp +++ b/source/backend/vulkan/image/execution/glsl/roipooling.comp @@ -1,6 +1,6 @@ #version 440 core layout(std140) buffer; -layout(set=0, binding=0) writeonly restrict uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; layout(set=0, binding=2) uniform sampler2D uRoI; diff --git a/source/backend/vulkan/image/execution/glsl/scale.comp b/source/backend/vulkan/image/execution/glsl/scale.comp index 52b53d9a5..1ead1fa41 100644 --- a/source/backend/vulkan/image/execution/glsl/scale.comp +++ b/source/backend/vulkan/image/execution/glsl/scale.comp @@ -1,6 +1,6 @@ #version 440 core layout(std140) buffer; -layout(set=0, binding=0) writeonly restrict uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; layout(set = 0, binding = 2) readonly buffer scaleBuffer{ diff --git a/source/backend/vulkan/image/execution/glsl/unPackImage4x4.comp b/source/backend/vulkan/image/execution/glsl/unPackImage4x4.comp index c10f6812c..57b22d56f 100644 --- a/source/backend/vulkan/image/execution/glsl/unPackImage4x4.comp +++ b/source/backend/vulkan/image/execution/glsl/unPackImage4x4.comp @@ -1,6 +1,6 @@ #version 440 core layout(std430) buffer; -layout(set=0, binding=0) uniform mediump sampler2D uInput; +layout(set=0, binding=0) uniform sampler2D uInput; layout(set=0, binding=1) writeonly buffer sourceBuffer{ vec4 data[]; diff --git a/source/backend/vulkan/image/execution/glsl/unaryImage.comp b/source/backend/vulkan/image/execution/glsl/unaryImage.comp index 900ab129d..6483e2bf6 100644 --- a/source/backend/vulkan/image/execution/glsl/unaryImage.comp +++ b/source/backend/vulkan/image/execution/glsl/unaryImage.comp @@ -1,5 +1,5 @@ #version 440 -layout(set=0, binding=0) writeonly restrict uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; layout(set=0, binding=2) uniform constBuffer{ ivec4 size; // x: limit, y: channelC4*b, z:height, w:width diff --git a/source/backend/vulkan/image/execution/glsl/winogradTransformDest2_3_1.comp b/source/backend/vulkan/image/execution/glsl/winogradTransformDest2_3_1.comp index a5b0e600c..1317d1fa5 100644 --- a/source/backend/vulkan/image/execution/glsl/winogradTransformDest2_3_1.comp +++ b/source/backend/vulkan/image/execution/glsl/winogradTransformDest2_3_1.comp @@ -1,9 +1,9 @@ #version 450 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; layout(set=0, binding=2) uniform sampler2D uBias; -layout(set=0, binding=3) readonly restrict uniform constBuffer { +layout(set=0, binding=3) readonly uniform constBuffer { ivec4 inputSize; ivec4 outputSize; int padX; @@ -12,7 +12,7 @@ layout(set=0, binding=3) readonly restrict uniform constBuffer { int unitHeight; int unit; } uConst; -layout(set=0, binding=4) readonly restrict uniform offsetBuffer { +layout(set=0, binding=4) readonly uniform offsetBuffer { ivec2 offset; } uOffset; diff --git a/source/backend/vulkan/image/execution/glsl/winogradTransformSource2_3_1.comp b/source/backend/vulkan/image/execution/glsl/winogradTransformSource2_3_1.comp index 52ac4c6f6..6b30d22c3 100644 --- a/source/backend/vulkan/image/execution/glsl/winogradTransformSource2_3_1.comp +++ b/source/backend/vulkan/image/execution/glsl/winogradTransformSource2_3_1.comp @@ -1,8 +1,8 @@ #version 450 core layout(std430) buffer; -layout(set=0, binding=0) writeonly restrict uniform image2D uOutput; +layout(set=0, binding=0) writeonly uniform image2D uOutput; layout(set=0, binding=1) uniform sampler2D uInput; -layout(set=0, binding=2) readonly restrict uniform constBuffer { +layout(set=0, binding=2) readonly uniform constBuffer { ivec4 inputSize; ivec4 outputSize; int padX; @@ -12,7 +12,7 @@ layout(set=0, binding=2) readonly restrict uniform constBuffer { int unit; } uConst; -layout(set=0, binding=3) readonly restrict uniform offsetBuffer { +layout(set=0, binding=3) readonly uniform offsetBuffer { ivec2 offset; } uOffset; int CLAMP_ADD(int x) { diff --git a/source/backend/vulkan/image/shaders/AllShader.h b/source/backend/vulkan/image/shaders/AllShader.h index ada1630e4..4297b2ced 100644 --- a/source/backend/vulkan/image/shaders/AllShader.h +++ b/source/backend/vulkan/image/shaders/AllShader.h @@ -212,6 +212,10 @@ extern const unsigned char glsl_buffer2Image1D_comp[]; extern unsigned int glsl_buffer2Image1D_comp_len; extern const unsigned char glsl_scale_comp[]; extern unsigned int glsl_scale_comp_len; +extern const unsigned char glsl_argmax_comp[]; +extern unsigned int glsl_argmax_comp_len; +extern const unsigned char glsl_argmax_ARGMIN_comp[]; +extern unsigned int glsl_argmax_ARGMIN_comp_len; extern const unsigned char glsl_buffer2Image3D_comp[]; extern unsigned int glsl_buffer2Image3D_comp_len; #endif \ No newline at end of file diff --git a/source/backend/vulkan/runtime/VulkanRuntime.cpp b/source/backend/vulkan/runtime/VulkanRuntime.cpp index 3ef3b722d..795c24f99 100644 --- a/source/backend/vulkan/runtime/VulkanRuntime.cpp +++ b/source/backend/vulkan/runtime/VulkanRuntime.cpp @@ -34,22 +34,40 @@ class VulkanBufferAllocator : public BufferAllocator::Allocator { float VulkanRuntime::onGetMemoryInMB() { return mMemoryPool->computeSize(); } - -VulkanRuntime::VulkanRuntime(const Backend::Info& info) { - mInfo = info; +VulkanRuntime* VulkanRuntime::create(const Backend::Info& info) { MNNVulkanContext* context = nullptr; + std::shared_ptr device; + std::shared_ptr instance; if (nullptr != info.user && nullptr != info.user->sharedContext) { MNN_PRINT("Use user's vulkan context\n"); context = static_cast(info.user->sharedContext); } if (NULL != context) { - mInstance = std::make_shared(context->pInstance); - mDevice = std::make_shared(mInstance, context->pPhysicalDevice, context->pDevice, + instance = std::make_shared(context->pInstance); + if (context->pInstance == VK_NULL_HANDLE) { + MNN_ERROR("Invalide user's vulkan instance\n"); + return nullptr; + } + device = std::make_shared(instance, context->pPhysicalDevice, context->pDevice, context->iQueueFamilyIndex, context->pQueue); } else { - mInstance = std::make_shared(); - mDevice = std::make_shared(mInstance); + instance = std::make_shared(); + if (!instance->supportVulkan()) { + MNN_ERROR("Invalide device for support vulkan\n"); + return nullptr; + } + device = std::make_shared(instance); + } + if (device->get() == VK_NULL_HANDLE) { + return nullptr; } + return new VulkanRuntime(info, device, instance); +} + +VulkanRuntime::VulkanRuntime(const Backend::Info& info, std::shared_ptr device, std::shared_ptr instance) { + mInfo = info; + mDevice = device; + mInstance = instance; auto& dev = *mDevice; mCmdPool = std::make_shared(dev); //GFlops, Test by mobilenet v1's ms @@ -168,31 +186,11 @@ int VulkanRuntime::onGetRuntimeStatus(RuntimeStatus statusEnum) const { } return 0; } -static bool _testVulkan() { - // std::make_unique need c++14 - std::unique_ptr instance(new VulkanInstance()); - if (nullptr == instance) { - MNN_ERROR("Invalide device for support vulkan\n"); - return false; - } - if (!instance->success()) { - MNN_ERROR("Invalide device for support vulkan\n"); - return false; - } - if (!instance->supportVulkan()) { - MNN_ERROR("Invalide device for support vulkan\n"); - return false; - } - return true; -} - class VulkanRuntimeCreator : public RuntimeCreator { public: virtual Runtime* onCreate(const Backend::Info& info) const { if (InitVulkan()) { - if (_testVulkan()) { - return new VulkanRuntime(info); - } + return VulkanRuntime::create(info); } return nullptr; } diff --git a/source/backend/vulkan/runtime/VulkanRuntime.hpp b/source/backend/vulkan/runtime/VulkanRuntime.hpp index ab9edbab4..c8dfa56ac 100644 --- a/source/backend/vulkan/runtime/VulkanRuntime.hpp +++ b/source/backend/vulkan/runtime/VulkanRuntime.hpp @@ -24,7 +24,6 @@ namespace MNN { class VulkanRuntime : public Runtime { public: - VulkanRuntime(const Backend::Info& info); virtual ~ VulkanRuntime(); virtual Backend* onCreate(const BackendConfig* config) const override; @@ -34,7 +33,9 @@ class VulkanRuntime : public Runtime { int onGetRuntimeStatus(RuntimeStatus statusEnum) const override; std::shared_ptr allocUniform(const void* src = nullptr, int size = 0); void recycleUniform(std::shared_ptr buffer); + static VulkanRuntime* create(const Backend::Info& info); private: + VulkanRuntime(const Backend::Info& info, std::shared_ptr device, std::shared_ptr instance); Backend::Info mInfo; std::shared_ptr mBufferPool; std::shared_ptr mPipelineFactory; diff --git a/source/backend/vulkan/vulkan/vulkan_core.h b/source/backend/vulkan/vulkan/vulkan_core.h index 228e4ef6e..67a14f7bb 100644 --- a/source/backend/vulkan/vulkan/vulkan_core.h +++ b/source/backend/vulkan/vulkan/vulkan_core.h @@ -2078,6 +2078,12 @@ typedef enum VkImageCreateFlagBits { } VkImageCreateFlagBits; typedef VkFlags VkImageCreateFlags; +// Introduced from "vulkan_core.h" in Vulkan SDK Version 1.3.290.0. +typedef enum VkInstanceCreateFlagBits { + VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR = 0x00000001 +} VkInstanceCreateFlagBits; +typedef VkFlags VkInstanceCreateFlags; + typedef enum VkSampleCountFlagBits { VK_SAMPLE_COUNT_1_BIT = 0x00000001, VK_SAMPLE_COUNT_2_BIT = 0x00000002, @@ -9466,7 +9472,8 @@ typedef VkFormatFeatureFlagBits2 VkFormatFeatureFlagBits2KHR; typedef VkFormatProperties3 VkFormatProperties3KHR; - +// Introduced from "vulkan_core.h" in Vulkan SDK Version 1.3.290.0. +#define VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME "VK_KHR_portability_enumeration" #define VK_KHR_maintenance4 1 #define VK_KHR_MAINTENANCE_4_SPEC_VERSION 2 diff --git a/source/core/Backend.cpp b/source/core/Backend.cpp index 50bda5066..c1d9a267b 100644 --- a/source/core/Backend.cpp +++ b/source/core/Backend.cpp @@ -69,22 +69,6 @@ void registerBackend() { #if MNN_METAL_ENABLED registerMetalRuntimeCreator(); #endif - auto& gExtraCreator = GetExtraCreator(); - for(auto iter = gExtraCreator.begin(); iter != gExtraCreator.end();){ - if(!iter->second.second){ - iter++; - }else{ - Backend::Info info; - info.type = iter->first; - std::shared_ptr bn(iter->second.first->onCreate(info)); - if (nullptr == bn.get()) { - iter = gExtraCreator.erase(iter); - MNN_ERROR("Error to use creator of %d, delete it\n", info.type); - }else{ - iter++; - } - } - } }); } diff --git a/source/core/Backend.hpp b/source/core/Backend.hpp index 2605047ec..0d199bd90 100644 --- a/source/core/Backend.hpp +++ b/source/core/Backend.hpp @@ -39,6 +39,15 @@ struct RuntimeHint { // 2: Only quantize value cache, use fp8 quantization // 3: quantize both key and value cache as described above int kvcacheQuantOption = 0; + + // the kvcache size limit of each layer + // if the size of kvcache in memory exceeds the limit + // it will be moved to disk to save memory + // -1 for no limit + int kvcacheSizeLimit = -1; + + // path of the kvcache directory + std::string kvcacheDirPath = "/tmp"; }; /** abstract backend */ class Backend : public NonCopyable { @@ -263,7 +272,7 @@ class Runtime : public NonCopyable { /** @brief reset runtime */ - virtual void onReset(int numberThread, const BackendConfig* config) { + virtual void onReset(int numberThread, const BackendConfig* config, bool full) { // Do nothing } diff --git a/source/core/BufferAllocator.cpp b/source/core/BufferAllocator.cpp index 1495bc7c1..43104da80 100644 --- a/source/core/BufferAllocator.cpp +++ b/source/core/BufferAllocator.cpp @@ -309,7 +309,7 @@ MemChunk DeferBufferAllocator::alloc(size_t size, bool separate, size_t align) { auto newChunk = createMemNode(size); insert_after(newChunk); #ifdef DUMP_USAGE - MNN_PRINT("Defer alloc: %p\n", newChunk); + MNN_PRINT("Defer alloc: %p, %d\n", newChunk, size); #endif return MemChunk(newChunk); } @@ -332,7 +332,7 @@ MemChunk DeferBufferAllocator::alloc(size_t size, bool separate, size_t align) { // equal no change; small expand selectChunk->size = size; #ifdef DUMP_USAGE - MNN_PRINT("Defer alloc: %p\n", selectChunk); + MNN_PRINT("Defer alloc: %p, %d\n", selectChunk, size); #endif return MemChunk(selectChunk); } diff --git a/source/core/ConvolutionCommon.cpp b/source/core/ConvolutionCommon.cpp index 2418bd211..970b17288 100644 --- a/source/core/ConvolutionCommon.cpp +++ b/source/core/ConvolutionCommon.cpp @@ -16,20 +16,57 @@ namespace MNN { -std::shared_ptr ConvolutionCommon::load(const Convolution2D *conv, Backend* backend, bool forceFloat, bool forceInt8) { +std::shared_ptr ConvolutionCommon::load(const Op* op, Backend* backend, bool forceFloat, bool forceInt8) { + auto conv = op->main_as_Convolution2D(); auto quan = conv->quanParameter(); auto result = std::make_shared(); result->quan = quan; size_t buffer_size = 0, alpha_size = 0; const int8_t* buffer_ptr = nullptr; const float* alpha_ptr = nullptr; - if (quan->buffer()) { - buffer_size = quan->buffer()->size(); - buffer_ptr = quan->buffer()->data(); - } - if (quan->alpha()) { - alpha_size = quan->alpha()->size(); - alpha_ptr = quan->alpha()->data(); + std::unique_ptr external_buffer; + size_t weightLength = 0; + int8_t *buffer = nullptr; + if (USE_EXTERNAL_DATA(conv) && op->externalPath() && quan->buffer() == nullptr) { + // external data + auto external_info = conv->external()->data(); + std::unique_ptr external_file(new FileLoader(op->externalPath()->c_str())); + external_file->offset(external_info[0]); + buffer_size = external_info[1]; + if (0 != buffer_size) { + if (1 == quan->type() && !forceFloat) { + buffer = IDSTDecoder::ReadQuanData_c(external_file.get(), &weightLength, result.get(), quan->shapeInt32(), forceInt8); + } else { + external_buffer.reset(new int8_t[buffer_size]); + buffer_ptr = external_buffer.get(); + external_file->read((char*)buffer_ptr, buffer_size); + } + } + alpha_size = external_info[2] / sizeof(float); + if (0 != alpha_size) { + result->alpha.reset(alpha_size); + if (nullptr == result->alpha.get()) { + MNN_PRINT("Alloc memory error for extract idst int8\n"); + return nullptr; + } + alpha_ptr = result->alpha.get(); + external_file->read((char*)alpha_ptr, alpha_size * sizeof(float)); + } + } else { + if (quan->buffer()) { + buffer_size = quan->buffer()->size(); + buffer_ptr = quan->buffer()->data(); + } + if (quan->alpha()) { + alpha_size = quan->alpha()->size(); + alpha_ptr = quan->alpha()->data(); + result->alpha.reset(alpha_size); + if (nullptr == result->alpha.get()) { + MNN_PRINT("Alloc memory error for extract idst int8\n"); + return nullptr; + } + ::memcpy(result->alpha.get(), alpha_ptr, alpha_size * sizeof(float)); + } } if (quan->index() != nullptr) { if (forceFloat) { @@ -51,16 +88,15 @@ std::shared_ptr ConvolutionCommon::load(const Con } // Otherwise needn't treat, just return result with quan info return result; } - size_t weightLength = 0; - int8_t *buffer = nullptr; - auto originBuffer = (unsigned char *)buffer_ptr; - if (1 == quan->type()) { - buffer = IDSTDecoder::ReadQuanData_c(originBuffer, &weightLength, result.get(), quan->shapeInt32()); + std::unique_ptr originBuffer(new MemoryLoader((unsigned char*)buffer_ptr)); + if (1 == quan->type() && weightLength == 0) { + buffer = IDSTDecoder::ReadQuanData_c(originBuffer.get(), &weightLength, result.get(), quan->shapeInt32(), forceInt8); } if (2 == quan->type()) { - buffer = IDSTDecoder::ReadSparseQuanData_c(originBuffer, &weightLength, alpha_ptr, alpha_size, result.get(), quan->shapeInt32()); + buffer = IDSTDecoder::ReadSparseQuanData_c(originBuffer.get(), &weightLength, alpha_ptr, alpha_size, result.get(), quan->shapeInt32()); } + /* if (result->weightMap.size() > 0) { result->canUseInt4 = true; for (auto value : result->weightMap) { @@ -69,6 +105,7 @@ std::shared_ptr ConvolutionCommon::load(const Con } } } + */ // read fp16 data if (3 == quan->type()) { weightLength = buffer_size / sizeof(half_float::half); @@ -99,12 +136,6 @@ std::shared_ptr ConvolutionCommon::load(const Con } result->weight.set(buffer, weightLength); } - result->alpha.reset(alpha_size); - if (nullptr == result->alpha.get()) { - MNN_PRINT("Alloc memory error for extract idst int8\n"); - return nullptr; - } - ::memcpy(result->alpha.get(), alpha_ptr, alpha_size * sizeof(float)); { int outputCount = 0; bool oldType4 = (quan->type() == 4 && quan->aMin() == 0 && std::abs(quan->quantScale()) < 1e-6); @@ -128,9 +159,10 @@ std::shared_ptr ConvolutionCommon::load(const Con // for old type 4 models, their quan->quantScale is 0. which will introduce a bug here if (oldType4) { extraFactor = 1.0f; - } - for (int o=0; oalpha.size(); ++o) { - result->alpha.get()[o] *= extraFactor; + } else if (extraFactor != 1.0f) { + for (int o=0; oalpha.size(); ++o) { + result->alpha.get()[o] *= extraFactor; + } } } } @@ -172,12 +204,13 @@ std::shared_ptr ConvolutionCommon::load(const Con return result; } -void ConvolutionCommon::getConvParameters(std::shared_ptr *quanCommon, Backend* backend, const MNN::Convolution2D *conv2d, const float** originWeight, int* originWeightSize) { +void ConvolutionCommon::getConvParameters(std::shared_ptr *quanCommon, Backend* backend, const MNN::Op *op, const float** originWeight, int* originWeightSize) { + auto conv2d = op->main_as_Convolution2D(); *originWeight = nullptr; *originWeightSize = 0; if (nullptr != conv2d->quanParameter()) { bool forceFloat = conv2d->quanParameter()->index() != nullptr; - *quanCommon = load(conv2d, backend, forceFloat); + *quanCommon = load(op, backend, forceFloat); *originWeight = (*quanCommon)->weightFloat.get(); *originWeightSize = (*quanCommon)->weightFloat.size(); } @@ -187,8 +220,9 @@ void ConvolutionCommon::getConvParameters(std::shared_ptr *quanCommo } } -bool ConvolutionCommon::getConvInt8Parameters(const MNN::Convolution2D* conv2d, std::shared_ptr& quanCommon, Backend* backend, +bool ConvolutionCommon::getConvInt8Parameters(const MNN::Op* op, std::shared_ptr& quanCommon, Backend* backend, const int8_t*& weight, int& weightSize, float*& scale, int32_t*& bias, int32_t*& weightQuantZeroPoint) { + auto conv2d = op->main_as_Convolution2D(); int outputCount = conv2d->common()->outputCount(); weightSize = 0; auto core = static_cast(backend)->functions(); @@ -197,8 +231,8 @@ bool ConvolutionCommon::getConvInt8Parameters(const MNN::Convolution2D* conv2d, weight = conv2d->symmetricQuan()->weight()->data(); weightSize = conv2d->symmetricQuan()->weight()->size(); } - if (conv2d->quanParameter() && conv2d->quanParameter()->buffer()) { // int8 weight - quanCommon = ConvolutionCommon::load(conv2d, backend, false, true); + if (conv2d->quanParameter() && (conv2d->quanParameter()->buffer() || conv2d->external())) { // int8 weight + quanCommon = ConvolutionCommon::load(op, backend, false, true); MNN_ASSERT(quanCommon != nullptr); weight = quanCommon->weight.get(); weightSize = quanCommon->weight.size(); @@ -211,6 +245,7 @@ bool ConvolutionCommon::getConvInt8Parameters(const MNN::Convolution2D* conv2d, if (quanCommon && quanCommon->asymmetric) { weightAsy = true; } + if (conv2d->symmetricQuan() && conv2d->symmetricQuan()->bias() && conv2d->symmetricQuan()->scale()) { // Compability for old model MNN_ASSERT(conv2d->symmetricQuan()->bias()->size() == outputCount && conv2d->symmetricQuan()->scale()->size() == outputCount); diff --git a/source/core/ConvolutionCommon.hpp b/source/core/ConvolutionCommon.hpp index 28e3acf83..7b1bbd5f0 100644 --- a/source/core/ConvolutionCommon.hpp +++ b/source/core/ConvolutionCommon.hpp @@ -24,9 +24,9 @@ class MNN_PUBLIC ConvolutionCommon : public Execution { bool canUseInt4 = false; Backend* backend = nullptr; }; - static std::shared_ptr load(const Convolution2D* conv, Backend* backend = nullptr, bool forceFloat = false, bool forceInt8 = false); - static void getConvParameters(std::shared_ptr *quanCommon, Backend* backend, const MNN::Convolution2D *conv2d, const float** originWeight, int* originWeightSize); - static bool getConvInt8Parameters(const MNN::Convolution2D* conv2d, std::shared_ptr& quanCommon, Backend* backend, + static std::shared_ptr load(const Op* op, Backend* backend = nullptr, bool forceFloat = false, bool forceInt8 = false); + static void getConvParameters(std::shared_ptr *quanCommon, Backend* backend, const MNN::Op *op, const float** originWeight, int* originWeightSize); + static bool getConvInt8Parameters(const MNN::Op* op, std::shared_ptr& quanCommon, Backend* backend, const int8_t*& weight, int& weightSize, float*& scale, int32_t*& bias, int32_t*& weightQuantZero); // Return padX, padY diff --git a/source/core/FileLoader.hpp b/source/core/FileLoader.hpp index 70fdddfd0..46e8036b8 100644 --- a/source/core/FileLoader.hpp +++ b/source/core/FileLoader.hpp @@ -13,14 +13,22 @@ #include "core/AutoStorage.h" namespace MNN { -class MNN_PUBLIC FileLoader { + +class BaseLoader { +public: + BaseLoader() = default; + virtual ~BaseLoader() = default; + virtual bool read(char* buffer, int64_t size) = 0; +}; + +class MNN_PUBLIC FileLoader : public BaseLoader { public: FileLoader(const char* file, bool init = false); ~FileLoader(); bool read(); - + static bool write(const char* filePath, std::pair cacheInfo); bool valid() const { @@ -29,6 +37,9 @@ class MNN_PUBLIC FileLoader { inline size_t size() const { return mTotalSize; } + inline std::string path() const { + return mFilePath; + } bool merge(AutoStorage& buffer); @@ -44,5 +55,17 @@ class MNN_PUBLIC FileLoader { std::string mFilePath; bool mInited = false; }; + +class MemoryLoader : public BaseLoader { +public: + MemoryLoader(unsigned char* ptr) : buffer_(ptr) {} + virtual bool read(char *dst, int64_t size) override { + ::memcpy(dst, buffer_, size); + buffer_ += size; + return true; + } +private: + unsigned char* buffer_ = nullptr; +}; } // namespace MNN #endif diff --git a/source/core/IDSTDecoder.hpp b/source/core/IDSTDecoder.hpp index 679e92fcc..757fdbf4d 100644 --- a/source/core/IDSTDecoder.hpp +++ b/source/core/IDSTDecoder.hpp @@ -12,6 +12,7 @@ #include #include #include "MNN_generated.h" +#include "core/FileLoader.hpp" #include "core/ConvolutionCommon.hpp" using namespace MNN; @@ -22,9 +23,9 @@ static inline void *MNNMemoryAllocAlignZeroAlign(size_t size) { return MNNMemoryCallocAlign(size, MNN_MEMORY_ALIGN_DEFAULT); } -static int ReadBlobDim(unsigned char *&myfile, unsigned int* shape, int shapeBufCnt, bool useInt32) { - int uSize = myfile[0]; - myfile++; +static int ReadBlobDim(BaseLoader* myfile, unsigned int* shape, int shapeBufCnt, bool useInt32) { + uint8_t uSize = 0; + myfile->read((char*)&uSize, 1); if (uSize > 4) { printf("Read shape error!\n"); return 0; @@ -34,14 +35,13 @@ static int ReadBlobDim(unsigned char *&myfile, unsigned int* shape, int shapeBuf copyLength = shapeBufCnt; } if (useInt32) { - ::memcpy(shape, myfile, sizeof(unsigned int) * copyLength); - myfile += copyLength * sizeof(unsigned int); + myfile->read((char*)shape, sizeof(unsigned int) * copyLength); } else { - auto myfileint16 = (uint16_t*)myfile; - for (int i=0; iread((char*)shape_i16, sizeof(uint16_t) * copyLength); + for (int i = 0; i < copyLength; ++i) { + shape[i] = shape_i16[i]; } - myfile += copyLength * sizeof(unsigned short); } return copyLength; } @@ -188,11 +188,6 @@ static int8_t FindInMap(PSIMPLE_MAP map, int8_t k, int *found) { return 0; } -static void StreamSizeRead(void *dst, int unit, size_t count, unsigned char *&file) { - ::memcpy(dst, file, unit * count); - file += (unit * count); -} - static bool isLinearSample(const std::vector& sample, int bit) { const int offset = 1 << (bit - 1); const int size = 1 << bit; @@ -207,16 +202,16 @@ static bool isLinearSample(const std::vector& sample, int bit) { return true; } -static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon::Int8Common* result, bool shapeInt32) { +static int8_t *ReadQuanData_c(BaseLoader* s, size_t* len, ConvolutionCommon::Int8Common* result, bool shapeInt32, bool forceQuant) { int8_t *blob = nullptr; uint8_t *idxBuf = nullptr; uint8_t *idxBytes = nullptr; - uint32_t dataCnt = 1; + size_t dataCnt = 1; do { // blob shape unsigned int shape[32] = {0}; - uint32_t shapeDim = (uint32_t)ReadBlobDim(s, shape, 32, shapeInt32); + uint32_t shapeDim = (uint32_t)ReadBlobDim(s, shape, 32, shapeInt32); if (shapeDim == 0 || shapeDim > 32) break; for (uint32_t i = 0; i < shapeDim; i++) @@ -224,7 +219,7 @@ static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon: // sample uint32_t sampleCnt = 0; - StreamSizeRead(&sampleCnt, 1, 1, s); + s->read((char*)&sampleCnt, 1); if (sampleCnt == 0) { sampleCnt = 256; } @@ -232,7 +227,7 @@ static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon: auto samples = result->weightMap.data(); if (samples == nullptr) break; - StreamSizeRead(samples, 1, sampleCnt, s); + s->read((char*)samples, sampleCnt); SimpleRank(samples, sampleCnt, 1); uint32_t idxBitsCnt = atLestBitsCnt(sampleCnt); idxBitsCnt = idxBitsCnt < 1 ? 1 : idxBitsCnt; @@ -243,18 +238,16 @@ static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon: MNN_ERROR("Not enought memory\n"); break; } - StreamSizeRead(idxBuf, 1, idxBufSize, s); + s->read((char*)idxBuf, idxBufSize); if (idxBitsCnt == 4) { dataCnt = UP_DIV(dataCnt, 2) * 2; } - blob = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)dataCnt); - if (nullptr == blob) { - break; - } if (isLinearSample(result->weightMap, idxBitsCnt) && (idxBitsCnt == 4 || idxBitsCnt == 8)) { - // fast sample for bit = 4 or 8 - if (idxBitsCnt == 4) { + if (!forceQuant && idxBitsCnt == 4) { + // back to float, 4bit to 8bit + *len = dataCnt; + blob = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)dataCnt); for (int i = 0; i < idxBufSize; i++) { int val = idxBuf[i]; int x1 = val / 16; @@ -262,14 +255,24 @@ static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon: blob[2 * i] = x1 - 8; blob[2 * i + 1] = x2 - 8; } - } - if (idxBitsCnt == 8) { - for (int i = 0; i < idxBufSize; i++) { - int val = idxBuf[i]; - blob[i] = val - 128; + } else { + // keep quant + blob = (int8_t*)idxBuf; + idxBuf = nullptr; + if (idxBitsCnt == 4) { + result->canUseInt4 = true; + } else { + for (int i = 0; i < idxBufSize; i++) { + blob[i] = (int)blob[i] - 128; + } } + *len = idxBufSize; } } else { + blob = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)dataCnt); + if (nullptr == blob) { + break; + } // split index value into bytes idxBytes = (uint8_t *)MNNMemoryAllocAlignZeroAlign(dataCnt * sizeof(uint8_t)); if (idxBitsCnt == 0 || nullptr == idxBytes) { @@ -292,6 +295,8 @@ static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon: } MNNMemoryFreeAlign(idxBytes); idxBytes = nullptr; + if (len) + *len = blob ? dataCnt : 0; } } while (0); @@ -299,12 +304,11 @@ static int8_t *ReadQuanData_c(unsigned char *&s, size_t* len, ConvolutionCommon: MNNMemoryFreeAlign(idxBuf); if (idxBytes != nullptr) MNNMemoryFreeAlign(idxBytes); - if (len) - *len = blob ? dataCnt : 0; + return blob; } -static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const float* alpha_ptr, size_t alpha_size, ConvolutionCommon::Int8Common* result, bool useInt32) { // MNN_ERROR("sparse:%d\n", 1); +static int8_t *ReadSparseQuanData_c(BaseLoader* myfile, size_t* len, const float* alpha_ptr, size_t alpha_size, ConvolutionCommon::Int8Common* result, bool useInt32) { // MNN_ERROR("sparse:%d\n", 1); unsigned int shape[32]; uint32_t ucMapSize = 0; PSIMPLE_SET setWeight = CreateSimpleSet(256); @@ -324,9 +328,9 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const f if (blob == nullptr) return nullptr; // 2. nnz - StreamSizeRead(&nnz, 4, 1, myfile); + myfile->read((char *)&nnz, 4); // 3. max_step use # bits () (unsigned char) - StreamSizeRead(&iIdxNeedBits, 1, 1, myfile); + myfile->read((char *)&iIdxNeedBits, 1); // read idx array // 4. buf for steps ceil(nnz*step need bits/8) AutoStorage arrIdxBuffer(nnz); @@ -340,12 +344,12 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const f if (nullptr == buf) { return nullptr; } - StreamSizeRead(buf, 1, bufLen, myfile); + myfile->read((char *)buf, bufLen); SplitBufToArray((uint8_t *)buf, (uint32_t)bufLen, (uint8_t *)arrIdx, (uint32_t)nnz, (uint32_t)iIdxNeedBits); MNNMemoryFreeAlign(buf); } // 5. Avalable values Count(unsigned char) - StreamSizeRead(&ucMapSize, 1, 1, myfile); + myfile->read((char *)&ucMapSize, 1); if (0 == ucMapSize) { ucMapSize = 256; } @@ -353,7 +357,7 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const f // 6. valueset(signed char * valueset_size) for (int i = 0; i < ucMapSize; i++) { int8_t tmp; - StreamSizeRead(&tmp, 1, 1, myfile); + myfile->read((char *)&tmp, 1); InsertSimpleSet(setWeight, tmp); result->weightMap[i] = tmp; } @@ -383,7 +387,7 @@ static int8_t *ReadSparseQuanData_c(unsigned char *&myfile, size_t* len, const f if (nullptr == buf) { return nullptr; } - StreamSizeRead(buf, 1, bufLen, myfile); + myfile->read((char *)buf, bufLen); SplitBufToArray((uint8_t *)buf, (uint32_t)bufLen, (uint8_t *)arrWeightIdx, (uint32_t)nnz, (uint32_t)iDataNeedBits); MNNMemoryFreeAlign(buf); diff --git a/source/core/Interpreter.cpp b/source/core/Interpreter.cpp index 127bd6e52..6620e0045 100644 --- a/source/core/Interpreter.cpp +++ b/source/core/Interpreter.cpp @@ -41,7 +41,7 @@ struct Content { std::string uuid; std::string externalFile; #ifdef MNN_INTERNAL_ENABLED - std::map basicLogginData; + std::string version; std::map> sessionInfo; #endif }; @@ -221,8 +221,7 @@ Interpreter::Interpreter(Content* net) { mNet->bizCode = std::string(mNet->net->bizCode() ? mNet->net->bizCode()->c_str() : ""); mNet->uuid = std::string(mNet->net->mnn_uuid() ? mNet->net->mnn_uuid()->c_str() : ""); #ifdef MNN_INTERNAL_ENABLED - mNet->basicLogginData = logBasicInfo(); - mNet->basicLogginData.emplace("ModelVersion", getModelVersion()); + mNet->version = getModelVersion(); #endif } @@ -329,7 +328,8 @@ Session* Interpreter::createMultiPathSession(const std::vector& int mode = configs[0].mode; mNet->sessionInfo.insert(std::make_pair(result, std::make_tuple(precision, mode))); if (shouldLog(FREQ_HIGH)) { - std::map metrics = mNet->basicLogginData; + std::map metrics = logBasicInfo(); + metrics.emplace("ModelVersion", mNet->version); metrics.emplace("UUID", mNet->uuid); metrics.emplace("Time", std::to_string((float)_timer.durationInUs() / 1024.0f)); metrics.emplace("Backend", std::to_string(configs[0].type)); @@ -383,7 +383,8 @@ void Interpreter::logForRunSession(const Session* session, float timeInMs, const session->getInfo(MNN::Interpreter::FLOPS, &flops); float memory = 0.0f; session->getInfo(MNN::Interpreter::MEMORY, &memory); - std::map metrics = mNet->basicLogginData; + std::map metrics = logBasicInfo(); + metrics.emplace("ModelVersion", mNet->version); metrics.emplace("UUID", mNet->uuid); metrics.emplace("Backend", std::to_string(backendType[0])); // "Precision" is not logged here. Don't need it. metrics.emplace("Time", std::to_string(timeInMs)); diff --git a/source/core/MNNFileUtils.cpp b/source/core/MNNFileUtils.cpp new file mode 100644 index 000000000..445ec2bb3 --- /dev/null +++ b/source/core/MNNFileUtils.cpp @@ -0,0 +1,284 @@ +// +// MNNFileUtils.cpp +// MNN +// +// Created by MNN on 2024/07/25. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "MNNFileUtils.h" + +std::string MNNFilePathConcat(std::string prefix, std::string suffix) { +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + return prefix + "\\" + suffix; +#else + return prefix + "/" + suffix; +#endif +} + +bool MNNDirExist(const char * path) { +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + DWORD file_attributes = GetFileAttributes(path); + return (file_attributes != INVALID_FILE_ATTRIBUTES) && (file_attributes & FILE_ATTRIBUTE_DIRECTORY); +#else + struct stat info; + return (stat(path, &info) == 0) && (info.st_mode & S_IFDIR); +#endif +} + +bool MNNFileExist(const char * file_name) +{ +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + return _access(file_name, 0) == 0; +#else + return access(file_name, F_OK) == 0; +#endif +} + +file_t MNNCreateFile(const char * file_name) +{ +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + HANDLE hd = CreateFile( + file_name, // File Name + GENERIC_READ | GENERIC_WRITE, // Read and Write + 0, // No Sharing + NULL, // No Security + CREATE_ALWAYS, // Create the file and cover the existing file + FILE_ATTRIBUTE_NORMAL, // Normal Attribute + NULL // No Template + ); + if (hd == INVALID_HANDLE_VALUE) { + printf("Failed to create the file: %s\n", file_name); + return INVALID_FILE; + } + return hd; +#else + int fd = open( + file_name, // File Name + O_RDWR | O_CREAT | O_TRUNC, // Read and Write and Create the file and cover existing file + 0666 // Read and Write Permission for Everyone + ); + if (fd == -1) { + printf("Failed to create the file: %s\n", file_name); + return INVALID_FILE; + } + return fd; +#endif +} + +file_t MNNOpenFile(const char * file_name, uint32_t flags) +{ + if (!MNNFileExist(file_name)) { + return INVALID_FILE; + } +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + DWORD mode = 0; + if (flags & MNN_FILE_READ) { + mode |= GENERIC_READ; + } + if (flags & MNN_FILE_WRITE) { + mode |= GENERIC_WRITE; + } + HANDLE hd = CreateFile( + file_name, // File Name + mode, // Opening Mode + 0, // No Sharing + NULL, // No Security + OPEN_EXISTING, // Only Open Existing File + FILE_ATTRIBUTE_NORMAL, // Normal Attribute + NULL // No Template + ); + if (hd == INVALID_HANDLE_VALUE) { + printf("Failed to open the file: %s\n", file_name); + return INVALID_FILE; + } + return hd; +#else + int mode = 0; + if (flags & MNN_FILE_READ) { + mode = O_RDONLY; + } + if (flags & MNN_FILE_WRITE) { + mode = O_RDWR; + } + int fd = open(file_name, mode); + if (fd == -1) { + printf("Failed to open the file: %s\n", file_name); + return INVALID_FILE; + } + return fd; +#endif +} + +ErrorCode MNNCloseFile(file_t file) +{ + if (file == INVALID_FILE) { + return FILE_NOT_EXIST; + } +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + if (!CloseHandle(file)) { + return FILE_CLOSE_FAILED; + } +#else + if (-1 == close(file)) { + return FILE_CLOSE_FAILED; + } +#endif + return NO_ERROR; +} + +ErrorCode MNNRemoveFile(const char * file_name) +{ + if (!MNNFileExist(file_name)) { + return FILE_NOT_EXIST; + } +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + if (!DeleteFile(file_name)) { + return FILE_REMOVE_FAILED; + } +#else + if (-1 == unlink(file_name)) { + return FILE_REMOVE_FAILED; + } +#endif + return NO_ERROR; +} + +size_t MNNGetFileSize(file_t file) +{ + if (file == INVALID_FILE) { + return INVALID_SIZE; + } +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + LARGE_INTEGER fileSize; + if (!GetFileSizeEx(file, &fileSize)) { + return (size_t)(-1); + } else { + return (size_t)(fileSize.QuadPart); + } +#else + struct stat file_stat; + if (fstat(file, &file_stat) == -1) { + return (size_t)(-1); + } else { + return file_stat.st_size; + } +#endif +} + +ErrorCode MNNSetFileSize(file_t file, size_t aimed_size) +{ + if (file == INVALID_FILE) { + return FILE_NOT_EXIST; + } +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + LARGE_INTEGER size; + size.QuadPart = aimed_size; + bool success = SetFilePointerEx(file, size, NULL, FILE_BEGIN); + if (!success) { + return FILE_RESIZE_FAILED; + } + success = SetEndOfFile(file); + if (!success) { + return FILE_RESIZE_FAILED; + } + return NO_ERROR; +#else + if (-1 == ftruncate(file, aimed_size)) { + return FILE_RESIZE_FAILED; + } + return NO_ERROR; +#endif +} + +size_t MNNReadFile(file_t file, void * buf, size_t bytes) +{ + if (file == INVALID_FILE || buf == nullptr) { + return 0; + } +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + DWORD readbytes = 0; + if (ReadFile(file, buf, bytes, &readbytes, NULL)) { + return readbytes; + } else { + return 0; + } +#else + return read(file, buf, bytes); +#endif +} + +size_t MNNWriteFile(file_t file, void * buf, size_t bytes) +{ + if (file == INVALID_FILE || buf == nullptr) { + return 0; + } +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + DWORD writebytes = 0; + if (WriteFile(file, buf, bytes, &writebytes, NULL)) { + return writebytes; + } else { + return 0; + } +#else + return write(file, buf, bytes); +#endif +} + +ErrorCode MNNSetFilePointer(file_t file, size_t offset) +{ + if (file == INVALID_FILE) { + return FILE_NOT_EXIST; + } +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + LARGE_INTEGER liDistanceToMove; + liDistanceToMove.QuadPart = offset; + if (SetFilePointerEx(file, liDistanceToMove, NULL, FILE_BEGIN)) { + return NO_ERROR; + } else { + return FILE_SEEK_FAILED; + } +#else + if (-1 == lseek(file, offset, SEEK_SET)) { + return FILE_SEEK_FAILED; + } else { + return NO_ERROR; + } +#endif +} + +void * MNNMmapFile(file_t file, size_t size) +{ + if (file == INVALID_FILE || MNNGetFileSize(file) < size) { + return nullptr; + } +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + HANDLE hFileMapping = CreateFileMapping(file, NULL, PAGE_READWRITE, (size >> 32) & 0xffffffff, size & 0xffffffff, NULL); + if (hFileMapping == NULL) { + return nullptr; + } + void * addr = MapViewOfFile(hFileMapping, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, size); + CloseHandle(hFileMapping); + return addr; +#else + void * addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, file, 0); + if (addr == MAP_FAILED) { + return nullptr; + } + return addr; +#endif +} + +ErrorCode MNNUnmapFile(void * addr, size_t size) +{ +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + if (!UnmapViewOfFile(addr)) { + return FILE_UNMAP_FAILED; + } +#else + if (-1 == munmap(addr, size)) { + return FILE_UNMAP_FAILED; + } +#endif + return NO_ERROR; +} \ No newline at end of file diff --git a/source/core/MNNFileUtils.h b/source/core/MNNFileUtils.h new file mode 100644 index 000000000..a3ecb4be8 --- /dev/null +++ b/source/core/MNNFileUtils.h @@ -0,0 +1,182 @@ +// +// MNNFileUtils.h +// MNN +// +// Created by MNN on 2024/07/25. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef MNN_FileUtils_H +#define MNN_FileUtils_H + +#include +#include +#include +#include "core/Macro.h" +#include "MNN/ErrorCode.hpp" +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) +#include +#include +#undef max +#undef min +#undef NO_ERROR +#else +#include +#include +#include +#include +#endif + +using namespace MNN; + +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) + typedef HANDLE file_t; + const file_t INVALID_FILE = INVALID_HANDLE_VALUE; +#else + typedef int file_t; + const file_t INVALID_FILE = -1; +#endif + +#define MNN_FILE_READ 1U +#define MNN_FILE_WRITE 2U +#define INVALID_SIZE ((size_t)(-1)) + +/*============================================================================================= +** @brief Concat a file name with a directory path +** @hint This function can be called multiple times to concat multi-level paths +*/ +MNN_PUBLIC std::string MNNFilePathConcat(std::string prefix, std::string suffix); + +/*============================================================================================= +** @brief Check whether a directory exists +** @param path -- path of the directory +** @return If the directory exists, returns true +** If the directory does not exist, return false +*/ +MNN_PUBLIC bool MNNDirExist(const char * path); + +/*============================================================================================= +** @brief Check whether a file exists +** @param file_name -- path of the file +** @return If the file exists, returns true +** If the file does not exist, return false +*/ +MNN_PUBLIC bool MNNFileExist(const char * file_name); + +/*============================================================================================= +** @brief Create a file +** @param file_name -- path of the file +** @return If succeeded, returns the handle of the created file in the read and write mode +** If failed, returns INVALID_FILE +** @warning If the file exists already, it will be covered +** Size of the newly created file will be 0 +*/ +MNN_PUBLIC file_t MNNCreateFile(const char * file_name); + +/*============================================================================================= +** @brief Open a file +** @param file_name -- path of the file +** flags -- openning mode (MNN_FILE_READ or MNN_FILE_WRITE or both) +** @return If succeeded, returns the handle of the file +** If failed, returns INVALID_FILE +** @warning If the file does not exist, this function would fail +** Make sure that the aimed file has been created by MNNCreateFile() +*/ +MNN_PUBLIC file_t MNNOpenFile(const char * file_name, uint32_t flags); + +/*============================================================================================= +** @brief Close a file +** @param file -- handle of the file +** @return If succeeded, returns NO_ERROR +** If failed, returns FAILED +** @warning Close an INVALID_FILE would fail +** Make sure that the aimed file has been opened by MNNOpenFile() +*/ +MNN_PUBLIC ErrorCode MNNCloseFile(file_t file); + +/*============================================================================================= +** @brief Remove a file +** @param file_name -- path of the file +** @return If succeeded, returns NO_ERROR +** If failed, returns FAILED +** @warning If the file does not exist, this function would fail +*/ +MNN_PUBLIC ErrorCode MNNRemoveFile(const char * file_name); + +/*============================================================================================= +** @brief Get the size of a file +** @param file -- handle of the file +** @return size of the file or INVALID_SIZE for INVALID_FILE +*/ +MNN_PUBLIC size_t MNNGetFileSize(file_t file); + +/*============================================================================================= +** @brief Resize a file +** @param file -- handle of the file +** aimed_size -- the aimed size of this file +** @return If succeeded, returns NO_ERROR +** If failed, returns FAILED +** @warning resize an INVALID_FILE would fail +*/ +MNN_PUBLIC ErrorCode MNNSetFileSize(file_t file, size_t aimed_size); + +/*============================================================================================= +** @brief Read from the file to the buffer +** @param file -- handle of the file +** buf -- start address of the buffer in memory +** bytes -- number of bytes to be read +** @return how many bytes have been read actually +** @warning Make sure that space of the buffer is enough +** Otherwise, this function may access out of bounds +*/ +MNN_PUBLIC size_t MNNReadFile(file_t file, void * buf, size_t bytes); + +/*============================================================================================= +** @brief Write to the file from the buffer +** @param file -- handle of the file +** buf -- start address of the buffer in memory +** bytes -- number of bytes to be written +** @return how many bytes have been written actually +** @warning Make sure the data in the buffer is enough +** Otherwise, this function may access out of bounds +*/ +MNN_PUBLIC size_t MNNWriteFile(file_t file, void * buf, size_t bytes); + +/*============================================================================================= +** @brief Set the file pointer to a given position +** @param file -- handle of the file +** offset -- the aimed postion from the start of the file +** @return If succeeded, returns NO_ERROR +** If failed, returns FAILED +** @warning Make sure the offset not exceeding the file size +*/ +MNN_PUBLIC ErrorCode MNNSetFilePointer(file_t file, size_t offset); + +/*============================================================================================= +** @brief Memory-map the file to the virtual address space of the current process +** @param file -- handle of the file +** size -- mapped length +** @return If succeeded, returns the start address of the mapped space +** If failed, return nullptr +** @hint Memory-mapping a file to the virtual address space enables the process to access it by pointers +** After the memory-mapping, the user can simply treat the mapped space as a memory buffer +** Read from or write to the mapped space will trigger data swapping +** between the file on disk and the kernel page cache in memory +** which is managed by the OS kernel and is transparent to the user +** @warning Make sure that the mapped size is no larger than the size of the file +** Especially when mapping a newly created file, whose size is 0 +*/ +MNN_PUBLIC void * MNNMmapFile(file_t file, size_t size); + +/*============================================================================================= +** @brief Unmap a previously mapped memory space +** @param addr -- start address of the mapped space +** size -- mapped length +** @return If succeeded, returns NO_ERROR +** If failed, returns FAILED +** @warning Make sure that this space was mapped by the MNNMmapFile() before +** and the size is correct +*/ +MNN_PUBLIC ErrorCode MNNUnmapFile(void * addr, size_t size); + +#endif // MNN_FileUtils_H \ No newline at end of file diff --git a/source/core/OpCommonUtils.cpp b/source/core/OpCommonUtils.cpp index f5e385605..8c5596312 100644 --- a/source/core/OpCommonUtils.cpp +++ b/source/core/OpCommonUtils.cpp @@ -619,8 +619,8 @@ static bool _RebuildExternalOp(FileLoader* external, const MNN::Op* origin, flat { auto layer_norm_param = op->main.AsLayerNorm(); int32_t size = static_cast(layer_norm_param->external[1]); - layer_norm_param->gamma.resize(size); - layer_norm_param->beta.resize(size); + layer_norm_param->gamma.resize(size / sizeof(float)); + layer_norm_param->beta.resize(size / sizeof(float)); external->offset(layer_norm_param->external[0]); external->read((char*)layer_norm_param->gamma.data(), layer_norm_param->external[1]); external->read((char*)layer_norm_param->beta.data(), layer_norm_param->external[2]); @@ -631,13 +631,21 @@ static bool _RebuildExternalOp(FileLoader* external, const MNN::Op* origin, flat { auto param = op->main.AsConvolution2D(); if (param->quanParameter) { - external->offset(param->external[0]); - if (0 != param->external[1]) { - param->quanParameter->buffer.resize(param->external[1]); - external->read((char*)param->quanParameter->buffer.data(), param->external[1]); + bool isSparse = param->sparseParameter.get() != nullptr; + bool isPTQ = param->quanParameter->scaleIn != 0; + if (isSparse || isPTQ) { + external->offset(param->external[0]); + if (0 != param->external[1]) { + param->quanParameter->buffer.resize(param->external[1]); + external->read((char*)param->quanParameter->buffer.data(), param->external[1]); + } + param->quanParameter->alpha.resize(param->external[2] / sizeof(float)); + external->read((char*)param->quanParameter->alpha.data(), param->external[2]); + } else { + // skip weight and dequant alpha for load speed + op->externalPath = external->path(); + external->offset(param->external[0] + param->external[1] + param->external[2]); } - param->quanParameter->alpha.resize(param->external[2] / sizeof(float)); - external->read((char*)param->quanParameter->alpha.data(), param->external[2]); if (param->bias.empty() && param->external.size() > 3) { param->bias.resize(param->external[3]/sizeof(float)); external->read((char*)param->bias.data(), param->external[3]); diff --git a/source/core/Pipeline.cpp b/source/core/Pipeline.cpp index 2798239e5..553b964e2 100644 --- a/source/core/Pipeline.cpp +++ b/source/core/Pipeline.cpp @@ -270,6 +270,7 @@ ErrorCode Pipeline::encode(bool supportDebug, bool permitCodegen) { } else { #ifndef MNN_BUILD_MINI mContext.clear(); + mContext.mNeedRelease = mGeometryNeedRelease; FileLoader l(mExternalFile.c_str()); /** Size Compute and compute Const Begin */ auto res = GeometryComputerUtils::shapeComputeAndGeometryTransform(&l, mInfo.second, mContext, mInfo.first.cache.second, mUseGeometry, false, permitCodegen); @@ -877,12 +878,17 @@ void Pipeline::_recycleDynamicMemory(Command* command) { } } void Pipeline::openResizeCheck() { +#ifndef MNN_BUILD_MINI + mGeometryNeedRelease = false; for (auto& info : mInfo.second) { info.computeCache.open(); } +#endif } ErrorCode Pipeline::fixResizeCache() { +#ifndef MNN_BUILD_MINI + // TODO: Recompute release mask and set mGeometryNeedRelease = true for (auto& info : mInfo.second) { if (info.type == Schedule::CONSTANT && (!info.computeCache.needExecuteConst)) { info.executeBuffer.command.clear(); @@ -895,6 +901,7 @@ ErrorCode Pipeline::fixResizeCache() { res = res && mInfo.first.cache.second->onSelectDynamicAllocator(1, 2); if (!res) { MNN_PRINT("%d backend don't support resize fix optimize\n", mInfo.first.cache.first->type()); + mGeometryNeedRelease = true; return NOT_SUPPORT; } size_t totalNumber = 0; @@ -946,6 +953,7 @@ ErrorCode Pipeline::fixResizeCache() { mInfo.first.cache.first->onSelectDynamicAllocator(0, 2); res && mInfo.first.cache.second->onSelectDynamicAllocator(0, 2); MNN_PRINT("Fix: %d - Total: %d, rate = %f\n", fixNumber, totalNumber, (float)fixNumber / (float)totalNumber); +#endif return NO_ERROR; } ErrorCode Pipeline::_allocForTensor(int index, bool allocInput) { @@ -1070,28 +1078,6 @@ ErrorCode Pipeline::_allocForTensor(int index, bool allocInput) { ErrorCode Pipeline::allocMemory(bool firstMalloc, bool forbidReplace) { // MNN_PRINT("allocMemory mtype:%d, cpubackendType:%d, cpuBackend runtime:%p\n", mBackend->type(), mBackupBackend->type(), mBackupBackend->getRuntime()); if (!firstMalloc) { - // For session setNeedMalloc, if session's output is set as some input, It may cause error - // Dup des to avoid it - for (auto& info : mInfo.second) { - auto& buffer = info.executeBuffer; - for (const auto& infoP : buffer.command) { - auto& info = *infoP; - for (auto t : info.workOutputs) { - if (!TensorUtils::getDescribe(t)->isMutable) { - continue; - } - auto des = TensorUtils::getDescribe(t); - auto usage = des->usage; - if (TensorUtils::getDescribeOrigin(t)->mContent.use_count() > 1 && usage != Tensor::InsideDescribe::CONSTANT) { - TensorUtils::getDescribeOrigin(t)->mem = nullptr; - auto res = TensorUtils::getDescribeOrigin(t)->getBackend()->onAcquireBuffer(t, Backend::STATIC); - if (!res) { - return OUT_OF_MEMORY; - } - } - } - } - } if (OpCommonUtils::supportDynamicInputMemory(mInfo.first.cache.first->type()) && (!mInfo.first.inputBackendChange)) { return NO_ERROR; } diff --git a/source/core/Pipeline.hpp b/source/core/Pipeline.hpp index 6fb9543d3..c3611fe59 100644 --- a/source/core/Pipeline.hpp +++ b/source/core/Pipeline.hpp @@ -81,6 +81,7 @@ class Pipeline : public NonCopyable { #ifndef MNN_BUILD_MINI GeometryComputer::Context mContext; Runtime::CompilerType mUseGeometry; + bool mGeometryNeedRelease = true; #endif const Runtime* mRuntime; const Runtime* mCpuRuntime; diff --git a/source/core/Schedule.cpp b/source/core/Schedule.cpp index 475708567..63065596d 100644 --- a/source/core/Schedule.cpp +++ b/source/core/Schedule.cpp @@ -142,9 +142,13 @@ MNNForwardType Schedule::getApprociateType(const ScheduleConfig& config) { Backend::Info info; info.type = type; std::shared_ptr bn(creator->onCreate(info)); - bool isSupportLowPower = bn->onGetRuntimeStatus(RuntimeStatus::STATUS_SUPPORT_POWER_LOW); - if(!isSupportLowPower) { - MNN_PRINT("type=%d backend don't Support Low Power, use %d instead\n", type, config.backupType); + if (nullptr != bn.get()) { + bool isSupportLowPower = bn->onGetRuntimeStatus(RuntimeStatus::STATUS_SUPPORT_POWER_LOW); + if(!isSupportLowPower) { + MNN_PRINT("type=%d backend don't Support Low Power, use %d instead\n", type, config.backupType); + type = config.backupType; + } + } else{ type = config.backupType; } } diff --git a/source/core/Schedule.hpp b/source/core/Schedule.hpp index a13b213f5..1065cfd82 100644 --- a/source/core/Schedule.hpp +++ b/source/core/Schedule.hpp @@ -78,6 +78,9 @@ class MNN_PUBLIC Schedule { std::map> executionCache; OpResizeCache computeCache; + + /** For CONSTANT info, can release indexes after resize*/ + std::vector releaseAbleInputs; }; // Backend, Tensor, shape-dirty, content-dirty diff --git a/source/core/Session.cpp b/source/core/Session.cpp index 9b27d5e1f..a424898ba 100644 --- a/source/core/Session.cpp +++ b/source/core/Session.cpp @@ -87,10 +87,24 @@ void Session::ModeGroup::setHint(Interpreter::HintMode mode, int hint) { case Interpreter::KVCACHE_QUANT_OPTIONS: runtimeHint.kvcacheQuantOption = hint; break; + case Interpreter::KVCACHE_SIZE_LIMIT: + runtimeHint.kvcacheSizeLimit = hint; + break; default: break; } } + +void Session::ModeGroup::setExternalPath(std::string path, int type) { + switch (type) { + case MNN::Interpreter::EXTERNAL_PATH_KVCACHE_DIR: + runtimeHint.kvcacheDirPath = path; + break; + default: + break; + } +} + Session::Session(Schedule::ScheduleInfo&& info, const ModeGroup& mode, RuntimeInfo&& runtime) { mMode = mode; mRuntime = std::move(runtime); @@ -251,18 +265,10 @@ ErrorCode Session::resize() { } } if(mMemoryUsageMode == Interpreter::Session_Memory_Collect) { - #ifdef LOG_VERBOSE - float memory = 0.0f; - #endif + mRuntime.second->onGabageCollect(0); for (auto& iter : mRuntime.first) { iter.second->onGabageCollect(0); - #ifdef LOG_VERBOSE - memory += iter.second->onGetMemoryInMB(); - #endif } - #ifdef LOG_VERBOSE - FUNC_PRINT_ALL(memory, f); - #endif } mNeedMalloc = false; mNeedResize = false; @@ -428,13 +434,14 @@ ErrorCode Session::updateToModel(Net* net) const { static void initTensors(std::vector>& tensors, const std::vector>& tensorSrc) { for (int i=0; iindex = i; } - } - for (int i = 0; i < tensors.size(); ++i) { auto srcDes = TensorUtils::getDescribe(tensorSrc[i].get()); if (srcDes->quantAttr != nullptr) { TensorUtils::getDescribe(tensors[i].get())->quantAttr.reset(new QuantAttr); diff --git a/source/core/Session.hpp b/source/core/Session.hpp index c753a6c51..8f7415ebd 100644 --- a/source/core/Session.hpp +++ b/source/core/Session.hpp @@ -39,6 +39,7 @@ class MNN_PUBLIC Session { RuntimeHint runtimeHint; void setHint(Interpreter::HintMode hint, int magic); void setMode(Interpreter::SessionMode mode); + void setExternalPath(std::string path, int type); }; Session(Schedule::ScheduleInfo&& info, const ModeGroup& mode, RuntimeInfo&& runtime); diff --git a/source/geometry/GeometryComputer.hpp b/source/geometry/GeometryComputer.hpp index f826e4d99..f85a49a6f 100644 --- a/source/geometry/GeometryComputer.hpp +++ b/source/geometry/GeometryComputer.hpp @@ -45,6 +45,7 @@ class GeometryComputer { return mMask & option; } std::shared_ptr mRasterOp; + bool mNeedRelease = true; private: void getRasterCacheCreate(Tensor* src, CommandBuffer& cmd); std::map>> mConstTensors; diff --git a/source/geometry/GeometryComputerUtils.cpp b/source/geometry/GeometryComputerUtils.cpp index fc76622ab..207d29e5d 100644 --- a/source/geometry/GeometryComputerUtils.cpp +++ b/source/geometry/GeometryComputerUtils.cpp @@ -150,6 +150,7 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( bool openCache = geoContext.support(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_OPENCACHE); /** Size Compute and compute Const Begin */ GeometryComputer::Context ctx(Interpreter::GeometryComputeMask::GEOMETRCOMPUTEMASK_ALL, backupBackend); + bool needRelease = geoContext.mNeedRelease; // Size Compute and compute Const for (int i=0; imem = nullptr; + } + } } /** Size Compute and compute Const End */ diff --git a/source/geometry/GeometryConv2DBackPropFilter.cpp b/source/geometry/GeometryConv2DBackPropFilter.cpp index f542da4ea..3277442d7 100644 --- a/source/geometry/GeometryConv2DBackPropFilter.cpp +++ b/source/geometry/GeometryConv2DBackPropFilter.cpp @@ -76,7 +76,7 @@ class GeometryConv2DBackPropFilter : public GeometryComputer { endDx = endDx - (endSx - iw + sw) / sw; endSx = endDx * sw + kx * dw - pads.first; } - if (startDy > endDy) { + if (startDy > endDy || startDx > endDx) { continue; } auto dstOffsetKx = dstOffsetKy + startDx; diff --git a/source/geometry/GeometryReverseSequence.cpp b/source/geometry/GeometryReverseSequence.cpp index f63a6e7e9..addcc0339 100644 --- a/source/geometry/GeometryReverseSequence.cpp +++ b/source/geometry/GeometryReverseSequence.cpp @@ -26,7 +26,7 @@ class GeometryReverseSequence : public GeometryComputer { MNN_ERROR("Dont's has Parameters for OpType_ReverseSequence\n"); return false; } - auto seqDim = op->main_as_ReverseSequenceParam()->seqDim(); + auto seqDim = op->main_as_ReverseSequenceParam()->seqDim(); // time_axis for ONNX if (seqDim < 0) { seqDim += inputs[0]->dimensions(); } @@ -82,7 +82,7 @@ class GeometryReverseSequence : public GeometryComputer { outputDes->regions.clear(); for (int batch = 0; batch < batchSize; ++batch) { - auto q = reverse->host()[batch]; + int q = reverse->host()[batch]; if (q > input->length(seqDim) || q < 1) { MNN_ERROR("ReverseSequence info error\n"); return false; diff --git a/source/shape/SizeComputer.cpp b/source/shape/SizeComputer.cpp index 3783aba65..61f449fcf 100644 --- a/source/shape/SizeComputer.cpp +++ b/source/shape/SizeComputer.cpp @@ -208,7 +208,7 @@ std::vector SizeComputer::needInputContent(const MNN::Op* op, int inputSize return std::vector{ inputSize - 1 }; } } - if (inputSize > 1 && (op->type() == OpType_Squeeze || op->type() == OpType_Unsqueeze)) { + if (inputSize > 1 && (op->type() == OpType_Squeeze || op->type() == OpType_Unsqueeze || op->type() == OpType_ReverseSequence || op->type() == OpType_Reverse)) { return std::vector{1}; } if (op->type() == OpType_CumSum) { diff --git a/source/utils/InitNet.cpp b/source/utils/InitNet.cpp index 534812f96..2c196851c 100644 --- a/source/utils/InitNet.cpp +++ b/source/utils/InitNet.cpp @@ -111,33 +111,49 @@ bool initConstTensors(std::vector>& tensors, const Net* return valid; } -bool initTensors(std::vector>& tensors, const Net* net) { +static void _createTensor(std::shared_ptr& dst, int index) { + if (dst.get() == nullptr) { + dst.reset(new Tensor); + TensorUtils::getDescribe(dst.get())->index = index; + } +} +bool initTensors(std::vector>& tensors, const Net* net, const int* oplists, size_t opListSize) { bool valid = true; auto describes = net->extraTensorDescribe(); - std::vector des(tensors.size()); - for (int i=0; iindex = i; - // MNN_PRINT("initTensors create tensor:%p, index:%d, backend:%d\n", tensors[i].get(), i, TensorUtils::getDescribe(tensors[i].get())->backend); + if (nullptr != oplists) { + for (int i=0; ioplists()->GetAs(oplists[i]); + if (nullptr != op->inputIndexes()) { + for (int v=0; vinputIndexes()->size(); ++v) { + auto index = op->inputIndexes()->data()[v]; + _createTensor(tensors[index], index); + } + } + if (nullptr != op->outputIndexes()) { + for (int v=0; voutputIndexes()->size(); ++v) { + auto index = op->outputIndexes()->data()[v]; + _createTensor(tensors[index], index); + } + } + } + } else { + for (int i=0; isize(); i++) { - int index = describes->GetAs(i)->index(); - des[index] = describes->GetAs(i); - } - } - for (int i = 0; i < tensors.size(); ++i) { - if (des[i] != nullptr && des[i]->quantInfo()) { - TensorUtils::getDescribe(tensors[i].get())->quantAttr.reset(new QuantAttr); - auto quant = TensorUtils::getDescribe(tensors[i].get())->quantAttr.get(); - quant->scale = des[i]->quantInfo()->scale(); - quant->zero = des[i]->quantInfo()->zero(); - quant->min = des[i]->quantInfo()->min(); - quant->max = des[i]->quantInfo()->max(); - // Don't copy datatype, it can be set by backend + auto des = describes->GetAs(i); + int index = des->index(); + if (tensors[index].get() != nullptr && des->quantInfo()) { + TensorUtils::getDescribe(tensors[index].get())->quantAttr.reset(new QuantAttr); + auto quant = TensorUtils::getDescribe(tensors[index].get())->quantAttr.get(); + quant->scale = des->quantInfo()->scale(); + quant->zero = des->quantInfo()->zero(); + quant->min = des->quantInfo()->min(); + quant->max = des->quantInfo()->max(); + } } } // Set Input Tensor, if the type of input is not the same with ExtraTensorDescribe, use input parameter @@ -147,6 +163,9 @@ bool initTensors(std::vector>& tensors, const Net* net) MNN_ASSERT(nullptr != op->outputIndexes()); MNN_ASSERT(op->outputIndexes()->size() == 1); auto index = op->outputIndexes()->data()[0]; + if (tensors[index].get() == nullptr) { + continue; + } auto tensor = tensors[index].get(); auto& tb = tensor->buffer(); auto inputParam = op->main_as_Input(); @@ -175,17 +194,16 @@ bool initTensors(std::vector>& tensors, const Net* net) return valid; } // static model will set all tensors' shape - for (int i = 0; i < describes->size(); i++) { - int index = describes->GetAs(i)->index(); - des[index] = describes->GetAs(i); - } - for (int i = 0; i < tensors.size(); ++i) { - if (TensorUtils::getDescribe(tensors[i].get())->usage != Tensor::InsideDescribe::NORMAL) { + for (int v = 0; v < describes->size(); v++) { + auto des = describes->GetAs(v); + int index = des->index(); + auto tensorDes = TensorUtils::getDescribe(tensors[index].get()); + if (tensorDes->usage != Tensor::InsideDescribe::NORMAL) { // Const / Trainable Shape has been inited continue; } - auto blob = des[i]->blob(); - auto& tb = tensors[i]->buffer(); + auto blob = des->blob(); + auto& tb = tensors[index]->buffer(); if (auto idims = blob->dims()) { for (int d = 0; d < idims->size(); d++) { tb.dim[d].extent = idims->Get(d); @@ -194,14 +212,12 @@ bool initTensors(std::vector>& tensors, const Net* net) } else { tb.dimensions = 0; } - tensors[i]->setType(blob->dataType()); - } - for (int i = 0; i < tensors.size(); ++i) { - auto blob = des[i]->blob(); - TensorUtils::getDescribe(tensors[i].get())->dimensionFormat = blob->dataFormat(); - if (auto regions = des[i]->regions()) { - auto& regs = TensorUtils::getDescribe(tensors[i].get())->regions; - TensorUtils::getDescribe(tensors[i].get())->memoryType = Tensor::InsideDescribe::MEMORY_BACKEND; + tensors[index]->setType(blob->dataType()); + tensorDes->dimensionFormat = blob->dataFormat(); + if (auto regions = des->regions()) { + auto& regs = tensorDes->regions; + tensorDes->memoryType = Tensor::InsideDescribe::MEMORY_BACKEND; + regs.clear(); regs.reserve(regions->size()); for (int r = 0; r < regions->size(); r++) { auto region = regions->GetAs(r); diff --git a/source/utils/InitNet.hpp b/source/utils/InitNet.hpp index 6bdcc34c0..e045ae0b5 100644 --- a/source/utils/InitNet.hpp +++ b/source/utils/InitNet.hpp @@ -18,7 +18,7 @@ MNN_PUBLIC bool computeShapeForBlob(const Blob* parameter, Tensor* output); MNN_PUBLIC bool initConstTensors(std::vector>& tensors, const Net* net, Backend* defaultBackend, ErrorCode& code, FileLoader* external); // init Tensors by net -MNN_PUBLIC bool initTensors(std::vector>& allTensors, const Net* net); +MNN_PUBLIC bool initTensors(std::vector>& allTensors, const Net* net, const int* oplists = nullptr, size_t opListSize = 0); // init Pipeline Infos by oplist and tensors MNN_PUBLIC void initPipelineInfosFromOps(std::vector& infos, std::vector& ops, const std::vector>& allTensors); // set input and output for allTensors by ops info diff --git a/test.sh b/test.sh index ef4edd95b..81ef7c647 100755 --- a/test.sh +++ b/test.sh @@ -77,8 +77,8 @@ doc_check() { # 1.2 check executable for executable in $executables do - if [ $(grep -c $executable ./docs/compile/tools.md) -le 0 ]; then - echo 'DOC CHECK FAILED:' $executable 'not in ./docs/compile/tools.md' + if [ $(grep -c $executable ./docs/compile/other.md) -le 0 ]; then + echo 'DOC CHECK FAILED:' $executable 'not in ./docs/compile/other.md' failed fi done @@ -117,6 +117,7 @@ doc_check() { } py_check() { + echo 'py_check' if [ -z "$PY_CHANGE" ]; then return fi @@ -133,6 +134,7 @@ py_check() { } static_check() { + echo 'static_check' if [ -z "$SOURCE_CHANGE" ]; then return fi @@ -310,18 +312,6 @@ onnx_convert_test() { echo '### ONNXConvert测试失败,测试终止!' failed fi - if [ -f ~/AliNNModel/TestOnnx/ops/run.py ]; then - ~/AliNNModel/TestOnnx/ops/run.py --mnndir $(pwd) --aone-mode - if [ $? -ne 0 ]; then - echo '### Onnx单线程单元测试失败,测试终止!' - failed - fi - #~/AliNNModel/TestOnnx/ops/run.py --mnndir $(pwd) --aone-mode --thread_num 2 - #if [ $? -ne 0 ]; then - # echo '### ONNX多线程单元测试失败,测试终止!' - # failed - #fi - fi } tf_convert_test() { @@ -525,7 +515,7 @@ android_model_test() { pass_num=0 fail_cl_num=0 pass_cl_num=0 - models=`ls ~/AliNNModel/OpTestResource/` + models=`adb shell ls /data/local/tmp/AliNNModel/OpTestResource/` for model in $models do adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./testModel.out ../AliNNModel/OpTestResource/$model/temp.bin ../AliNNModel/OpTestResource/$model/input_0.txt ../AliNNModel/OpTestResource/$model/output_0.txt 0 0.002" @@ -544,7 +534,7 @@ android_model_test() { fi done - models=`ls ~/AliNNModel/TestResource/` + models=`adb shell ls /data/local/tmp/AliNNModel/TestResource/` for model in $models do if [ $model == 'mobilenetv1quan' ]; then @@ -560,9 +550,9 @@ android_model_test() { if [ "$OPENCL_CHANGE" ]; then if [ $model == 'mobilenetv1quan' ]; then adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./testModel.out ../AliNNModel/TestResource/$model/temp.bin ../AliNNModel/TestResource/$model/input_0.txt ../AliNNModel/TestResource/$model/output.txt 3 0.1 1" - else + else adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./testModel.out ../AliNNModel/TestResource/$model/temp.bin ../AliNNModel/TestResource/$model/input_0.txt ../AliNNModel/TestResource/$model/output.txt 3 0.002 1" - fi + fi if [ $? -ne 0 ]; then fail_cl_num=$[$fail_cl_num+1] else @@ -571,7 +561,7 @@ android_model_test() { fi done - models=`ls ~/AliNNModel/TestWithDescribe/` + models=`adb shell ls /data/local/tmp/AliNNModel/TestWithDescribe/` for model in $models do adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./testModelWithDescribe.out ../AliNNModel/TestWithDescribe/$model/temp.bin ../AliNNModel/TestWithDescribe/$model/config.txt 0 0.002" @@ -703,6 +693,11 @@ case "$1" in android_static_build android_test ;; + static) + doc_check + static_check + py_check + ;; *) $1 echo $"Usage: $0 {local|linux|android|func}" diff --git a/test/MNNTestSuite.cpp b/test/MNNTestSuite.cpp index 2b3aeb52a..f37f1c038 100644 --- a/test/MNNTestSuite.cpp +++ b/test/MNNTestSuite.cpp @@ -33,6 +33,7 @@ void MNNTestSuite::add(MNNTestCase* test, const char* name) { static void printTestResult(int wrong, int right, const char* flag) { MNN_PRINT("TEST_NAME_UNIT%s: 单元测试%s\nTEST_CASE_AMOUNT_UNIT%s: ", flag, flag, flag); MNN_PRINT("{\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n", wrong, right); + MNN_PRINT("TEST_CASE={\"name\":\"单元测试%s\",\"failed\":%d,\"passed\":%d}\n", flag, wrong, right); } int MNNTestSuite::run(const char* key, int precision, const char* flag) { diff --git a/test/core/FileUtilsTest.cpp b/test/core/FileUtilsTest.cpp new file mode 100644 index 000000000..5a40bdce4 --- /dev/null +++ b/test/core/FileUtilsTest.cpp @@ -0,0 +1,320 @@ +// +// FileUtilsTest.cpp +// MNNTests +// +// Created by MNN on 2024/07/26. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "MNNTestSuite.h" +#include "core/MNNFileUtils.h" + +#if defined(WIN32) || defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER) +const char * file_path = "C:\\Windows\\Temp\\file_utils_test_temp_file"; +#elif defined(__ANDROID__) +const char * file_path = "/data/local/tmp/file_utils_test_temp_file"; +#elif defined(__APPLE__) || defined(__linux__) || defined(__unix__) +const char * file_path = "/tmp/file_utils_test_temp_file"; +#else +const char * file_path = "./file_utils_test_temp_file"; +#endif + +class FileUtilsTest : public MNNTestCase { +public: + virtual ~FileUtilsTest() = default; + virtual bool run(int precision) { + /*======== Create and Remove ========*/ + do { + // create a new file + file_t file = MNNCreateFile(file_path); + if (file == INVALID_FILE) { + return false; + } else { + MNNCloseFile(file); + } + bool exist = MNNFileExist(file_path); + if (!exist) { + return false; + } + // create a new file to cover an existing file + file = MNNCreateFile(file_path); + if (file == INVALID_FILE) { + return false; + } else { + MNNCloseFile(file); + } + exist = MNNFileExist(file_path); + if (!exist) { + return false; + } + // remove a file + MNN::ErrorCode ec = MNNRemoveFile(file_path); + if (ec != NO_ERROR) { + return false; + } + exist = MNNFileExist(file_path); + if (exist) { + return false; + } + printf("File Utils Test: Create and Remove passed\n"); + } while(false); + + /*======== Open and Close ========*/ + do { + // Open and close a non-existent file + file_t file = MNNOpenFile(file_path, MNN_FILE_READ | MNN_FILE_WRITE); + if (file != INVALID_FILE) { + return false; + } + MNN::ErrorCode ec = MNNCloseFile(file); + if (ec != FILE_NOT_EXIST) { + return false; + } + // Open and close an existent file + file = MNNCreateFile(file_path); + if (file == INVALID_FILE) { + return false; + } + bool exist = MNNFileExist(file_path); + if (!exist) { + return false; + } + ec = MNNCloseFile(file); + if (ec != NO_ERROR) { + return false; + } + file = MNNOpenFile(file_path, MNN_FILE_READ | MNN_FILE_WRITE); + if (file == INVALID_FILE) { + return false; + } + ec = MNNCloseFile(file); + if (ec != NO_ERROR) { + return false; + } + ec = MNNRemoveFile(file_path); + if (ec != NO_ERROR) { + return false; + } + exist = MNNFileExist(file_path); + if (exist) { + return false; + } + printf("File Utils Test: Open and Close passed\n"); + } while(false); + + /*======== Get and Set File Size ========*/ + do { + file_t file = MNNOpenFile(file_path, MNN_FILE_READ | MNN_FILE_WRITE); + if (file != INVALID_FILE) { + return false; + } + size_t size = MNNGetFileSize(file); + if (size != INVALID_SIZE) { + printf("File size mismatch: expected %lu but got %lu\n", INVALID_SIZE, size); + return false; + } + file = MNNCreateFile(file_path); + if (file == INVALID_FILE) { + return false; + } + size = MNNGetFileSize(file); + if (size != 0) { + printf("File size mismatch: expected 0 but got %lu\n", size); + return false; + } + size_t expectedSize = 1023; + MNN::ErrorCode ec = MNNSetFileSize(file, expectedSize); + if (ec != NO_ERROR) { + return false; + } + size = MNNGetFileSize(file); + if (size != expectedSize) { + printf("File size mismatch: expected %lu but got %lu\n", expectedSize, size); + return false; + } + expectedSize = 64 * 1024 * 1024; + ec = MNNSetFileSize(file, expectedSize); + if (ec != NO_ERROR) { + return false; + } + size = MNNGetFileSize(file); + if (size != expectedSize) { + printf("File size mismatch: expected %lu but got %lu\n", expectedSize, size); + return false; + } + ec = MNNCloseFile(file); + if (ec != NO_ERROR) { + return false; + } + ec = MNNRemoveFile(file_path); + if (ec != NO_ERROR) { + return false; + } + bool exist = MNNFileExist(file_path); + if (exist) { + return false; + } + printf("File Utils Test: Get and Set File Size passed\n"); + } while(false); + + /*======== Read and Write ========*/ + do { + char alpha[27] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + size_t size = 32; + char * buf = (char *)malloc(size); + if (buf == nullptr) { + printf("MNN_FAILED to allocate memory in File Utils Test!\n"); + return false; + } + file_t file = MNNOpenFile(file_path, MNN_FILE_READ | MNN_FILE_WRITE); + if (file != INVALID_FILE) { + return false; + } + size_t ret = MNNReadFile(file, nullptr, 0); + if (ret != 0) { + return false; + } + ret = MNNWriteFile(file, nullptr, 0); + if (ret != 0) { + return false; + } + file = MNNCreateFile(file_path); + if (file == INVALID_FILE) { + return false; + } + ret = MNNReadFile(file, buf, 10); + if (ret != 0) { + return false; + } + ret = MNNWriteFile(file, alpha, 26); + if (ret != 26) { + return false; + } + MNN::ErrorCode ec = MNNSetFilePointer(file, 0); + if (ec != NO_ERROR) { + return false; + } + ret = MNNReadFile(file, buf, 20); + if (ret != 20) { + return false; + } + ret = MNNReadFile(file, buf, 10); + if (ret != 6) { + return false; + } + ec = MNNSetFilePointer(file, 0); + if (ec != NO_ERROR) { + return false; + } + ret = MNNReadFile(file, buf, 3); + if (ret != 3) { + return false; + } + if (buf[0] != 'A' || buf[1] != 'B' || buf[2] != 'C') { + return false; + } + ec = MNNSetFilePointer(file, 20); + if (ec != NO_ERROR) { + return false; + } + ret = MNNReadFile(file, buf, 3); + if (ret != 3) { + return false; + } + if (buf[0] != 'U' || buf[1] != 'V' || buf[2] != 'W') { + return false; + } + ec = MNNSetFilePointer(file, 10); + if (ec != NO_ERROR) { + return false; + } + char hello[6] = "hello"; + ret = MNNWriteFile(file, (void *)hello, 6); + if (ret != 6) { + return false; + } + ec = MNNSetFilePointer(file, 10); + if (ec != NO_ERROR) { + return false; + } + ret = MNNReadFile(file, buf, 6); + if (0 != strcmp(buf, "hello")) { + return false; + } + ec = MNNCloseFile(file); + if (ec != NO_ERROR) { + return false; + } + ec = MNNRemoveFile(file_path); + if (ec != NO_ERROR) { + return false; + } + bool exist = MNNFileExist(file_path); + if (exist) { + return false; + } + free(buf); + printf("File Utils Test: Read and Write passed\n"); + } while(false); + + /*======== Map and Unmap ========*/ + do { + char * addr = (char *)MNNMmapFile(INVALID_FILE, INVALID_SIZE); + if (addr != nullptr) { + return false; + } + MNN::ErrorCode ec = MNNUnmapFile(addr, 0); + if (ec != FILE_UNMAP_FAILED) { + return false; + } + file_t file = MNNCreateFile(file_path); + if (file == INVALID_FILE) { + return false; + } + addr = (char *)MNNMmapFile(file, 1024); + if (addr != nullptr) { + return false; + } + ec = MNNSetFileSize(file, 1024); + if (ec != NO_ERROR) { + return false; + } + addr = (char *)MNNMmapFile(file, 1024); + if (addr == nullptr) { + return false; + } + strcpy(addr, "hello"); + ec = MNNUnmapFile(addr, 1024); + if (ec != NO_ERROR) { + return false; + } + addr = (char *)MNNMmapFile(file, 1024); + if (addr == nullptr) { + return false; + } + if(0 != strcmp(addr, "hello")) { + return false; + } + ec = MNNUnmapFile(addr, 1024); + if (ec != NO_ERROR) { + return false; + } + ec = MNNCloseFile(file); + if (ec != NO_ERROR) { + return false; + } + ec = MNNRemoveFile(file_path); + if (ec != NO_ERROR) { + return false; + } + bool exist = MNNFileExist(file_path); + if (exist) { + return false; + } + printf("File Utils Test: Map and Unmap passed\n"); + } while(false); + + return true; + } +}; +MNNTestSuiteRegister(FileUtilsTest, "core/file_utils"); diff --git a/test/core/IDSTTest.cpp b/test/core/IDSTTest.cpp index 62e620d60..bd4cf136b 100644 --- a/test/core/IDSTTest.cpp +++ b/test/core/IDSTTest.cpp @@ -22,14 +22,18 @@ class IDSTTest : public MNNTestCase { std::vector quantWeight(kernelNum * kernelSize, 0); // IDST encode std::unique_ptr idstQuantT = IDSTEncoder::encode(weight.data(), scale, kernelSize, kernelNum, false, quantWeight.data(), -127); - std::unique_ptr conv2dT(new Convolution2DT); + Convolution2DT* conv2dT = new Convolution2DT; + std::unique_ptr opT(new OpT); conv2dT->quanParameter = std::move(idstQuantT); + opT->type = OpType_Convolution; + opT->main.type = OpParameter_Convolution2D; + opT->main.value = conv2dT; flatbuffers::FlatBufferBuilder builder; - auto lastOffset = Convolution2D::Pack(builder, conv2dT.get()); + auto lastOffset = Op::Pack(builder, opT.get()); builder.Finish(lastOffset); - auto conv2d = flatbuffers::GetRoot(builder.GetBufferPointer()); + auto op = flatbuffers::GetRoot(builder.GetBufferPointer()); // IDST decode - std::shared_ptr common = ConvolutionCommon::load(conv2d); + std::shared_ptr common = ConvolutionCommon::load(op); // is input == output ? bool res = (0 == memcmp(common->weightFloat.get(), weight.data(), weight.size())); return res; diff --git a/test/expr/MemoryIncrease.cpp b/test/expr/MemoryIncrease.cpp index 90f089261..11f7fbc54 100644 --- a/test/expr/MemoryIncrease.cpp +++ b/test/expr/MemoryIncrease.cpp @@ -8,6 +8,8 @@ #include #include +#include +#include #include #include "MNNTestSuite.h" #include "MNN_generated.h" @@ -205,3 +207,76 @@ class MidOutputTest : public MNNTestCase { } }; MNNTestSuiteRegister(MidOutputTest, "expr/MidOutputTest"); + +class ConstFoldMemoryTest : public MNNTestCase { +public: + virtual bool run(int precision) { + BackendConfig bnConfig; + auto exe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1); + ExecutorScope scope(exe); + Module::Config config; + config.shapeMutable = true; + config.rearrange = true; + std::vector buffer; + { + // Make Buffer + auto x0 = _Input({1}, NCHW, halide_type_of()); + x0->setName("x0"); + auto x1 = _Const(1.0f, {256, 1024}, NCHW); + x1 = x1 * x1 * _Cos(x1) * _Sin(x1); + auto y0 = x0 * x1; + y0->setName("y0"); + buffer = Variable::save({y0}); + } + auto rtInfo = Express::ExecutorScope::Current()->getRuntime(); + auto rt = rtInfo.first.begin()->second; + MNN::ScheduleConfig sconfig; + std::vector sconfigs = {sconfig}; + std::shared_ptr rtMgr(Executor::RuntimeManager::createRuntimeManager(sconfigs)); + rtMgr->setMode(Interpreter::Session_Memory_Collect); + std::shared_ptr m0(Module::load({"x0"}, {"y0"}, (const unsigned char*)buffer.data(), buffer.size(), rtMgr, &config), Module::destroy); + std::shared_ptr m1(Module::load({"x0"}, {"y0"}, (const unsigned char*)buffer.data(), buffer.size(), rtMgr, &config), Module::destroy); + float memoryInit = 0.0f; + rtMgr->getInfo(Interpreter::MEMORY, &memoryInit); + FUNC_PRINT_ALL(memoryInit, f); + auto x = _Input({1}, NCHW, halide_type_of()); + x->writeMap(); + x->unMap(); + float memoryCurrent = 0.0f; + auto compute = [&](){ + m0->onForward({x}); + rtMgr->getInfo(Interpreter::MEMORY, &memoryCurrent); + auto static0 = memoryCurrent - memoryInit; + FUNC_PRINT_ALL(static0, f); + if (static0 > 2.1f) { + MNN_ERROR("Constant folder Memory too large\n"); + return false; + } + memoryInit = memoryCurrent; + m1->traceOrOptimize(Interpreter::Session_Resize_Check); + m1->onForward({x}); + rtMgr->getInfo(Interpreter::MEMORY, &memoryCurrent); + auto static1 = memoryCurrent - memoryInit; + FUNC_PRINT_ALL(static1, f); + if (static1 <= static0) { + MNN_ERROR("Check mod the memory should be larger than init mode\n"); + return false; + } + m1->traceOrOptimize(Interpreter::Session_Resize_Fix); + m1->onForward({x}); + rtMgr->getInfo(Interpreter::MEMORY, &memoryCurrent); + auto static2 = memoryCurrent - memoryInit; + FUNC_PRINT_ALL(static2, f); + if (static2 >= static1) { + MNN_ERROR("TODO: Fix mod the memory should be less than check mode\n"); + } + return true; + }; + bool res = compute(); + if (!res) { + return false; + } + return true; + } +}; +MNNTestSuiteRegister(ConstFoldMemoryTest, "expr/ConstFoldMemoryTest"); diff --git a/test/expr/ModuleShapeInfer.cpp b/test/expr/ModuleShapeInfer.cpp new file mode 100644 index 000000000..5d122c32e --- /dev/null +++ b/test/expr/ModuleShapeInfer.cpp @@ -0,0 +1,108 @@ +#include +#include +#include +#include "MNNTestSuite.h" +using namespace MNN; +using namespace MNN::Express; + +class ModuleShapeInfer : public MNNTestCase { +public: + static float _reduceSum(const float* zPtr, int size) { + float summer = 0.0f; + for (int i=0; i empty; + // Make Net + auto x = _Input({1, 3, 2, 2}, NCHW, halide_type_of()); + x->setName("x"); + auto y = x * x; + VARP starts; + VARP sizes; + { + std::vector sta = {0, 0, 1, 1}; + std::vector siz = {1, 1, 1, 1}; + starts = _Const(sta.data(), {4}, NCHW, halide_type_of()); + sizes = _Const(siz.data(), {4}, NCHW, halide_type_of()); + } + auto z = _Slice(y, starts, sizes); + z->setName("z"); + auto buffer = Variable::save({z}); + ScheduleConfig config; + BackendConfig bnConfig; + bnConfig.precision = MNN::BackendConfig::Precision_Low; + config.backendConfig = &bnConfig; + std::shared_ptr rt(Executor::RuntimeManager::createRuntimeManager(config), Executor::RuntimeManager::destroy); + std::shared_ptr net0(Module::load({"x"}, {"z"}, (const uint8_t*)buffer.data(), buffer.size(), rt), Module::destroy); + std::shared_ptr net1(Module::load({"x"}, {"z"}, (const uint8_t*)buffer.data(), buffer.size(), rt), Module::destroy); + x = _Input({1, 3, 2, 2}, NCHW, halide_type_of()); + // Run Init Value + auto inputPtr = x->writeMap(); + for (int i=0; igetInfo()->size; ++i) { + inputPtr[i] = i; + } + y = net0->onForward({x})[0]; + auto yPtr = y->readMap(); + auto ySize = y->getInfo()->size; + auto valueFirst = _reduceSum(yPtr, ySize); + for (int i=0; igetInfo()->size; ++i) { + inputPtr[i] = x->getInfo()->size - i; + } + y = net0->onForward({x})[0]; + yPtr = y->readMap(); + auto valueSecond = _reduceSum(yPtr, ySize); + + // Shape Infer mode + auto code = net1->traceOrOptimize(Interpreter::Module_Forward_Separate); + if (0 != code) { + FUNC_PRINT(1); + return false; + } + for (int i=0; igetInfo()->size; ++i) { + inputPtr[i] = i; + } + y = net1->onForward({x})[0]; + yPtr = y->readMap(); + auto tmp = net1->onForward(empty); + if (tmp.size() > 0) { + FUNC_PRINT(1); + return false; + } + if (_reduceSum(yPtr, ySize) != valueFirst) { + FUNC_PRINT(1); + return false; + } + for (int i=0; igetInfo()->size; ++i) { + inputPtr[i] = x->getInfo()->size - i; + } + net1->onForward(empty); + if (_reduceSum(yPtr, ySize) != valueSecond) { + FUNC_PRINT(1); + return false; + } + net1->traceOrOptimize(MNN::Interpreter::Module_Forward_Combine); + for (int i=0; igetInfo()->size; ++i) { + inputPtr[i] = i; + } + y = net1->onForward({x})[0]; + yPtr = y->readMap(); + if(_reduceSum(yPtr, ySize) != valueFirst) { + FUNC_PRINT(1); + return false; + } + for (int i=0; igetInfo()->size; ++i) { + inputPtr[i] = x->getInfo()->size - i; + } + y = net1->onForward({x})[0]; + yPtr = y->readMap(); + if(_reduceSum(yPtr, ySize) != valueSecond) { + FUNC_PRINT(1); + return false; + } + return true; + } +}; +MNNTestSuiteRegister(ModuleShapeInfer, "expr/ModuleShapeInfer"); diff --git a/test/expr/ReverseSequenceTest.cpp b/test/expr/ReverseSequenceTest.cpp index e93fef9df..7a9f7c9ff 100644 --- a/test/expr/ReverseSequenceTest.cpp +++ b/test/expr/ReverseSequenceTest.cpp @@ -15,6 +15,7 @@ class ReverseSequenceTest : public MNNTestCase { public: virtual bool run(int precision) { // high dimension, batch_dim ahead + { auto y = _Input({4}, NHWC, halide_type_of()); std::vector seq = {7, 2, 3, 5}; @@ -59,6 +60,7 @@ class ReverseSequenceTest : public MNNTestCase { } if (!func_equal(need, compute)) { + MNN_PRINT("case 1 error\n"); return false; } } @@ -66,7 +68,28 @@ class ReverseSequenceTest : public MNNTestCase { } } } - return true; + } + + { // test SizeComputer::needInputContent + int dim0 = 1, dim1 = 6, dim2 = 7, dim3 = 10, dim4 = 8; + auto x = _Input({dim0, dim1, dim2, dim3, dim4}, NHWC, halide_type_of()); + auto x_transpose = _Transpose(x, {1, 0, 2, 3, 4}); + auto x_shape = _Shape(x_transpose, NHWC); + int ii[]= {1}; + auto x_gather = _Gather(x_shape, _Const(ii, {1}, NCHW, halide_type_of())); + auto ry = _ReverseSequence(x_transpose, x_gather, 1, 3); + auto xPtr = x->writeMap(); + + for (int i = 0; i < dim0 * dim1 * dim2 * dim3 * dim4; ++i) { + xPtr[i] = 1; + } + + auto ryPtr = ry->readMap(); + + if (ryPtr == nullptr) { + MNN_PRINT("case 2 error\n"); + return false; + } } // high dimension, seq_dim ahead @@ -113,6 +136,7 @@ class ReverseSequenceTest : public MNNTestCase { need = 10000 * o + 1000 * (req - i - 1) + 100 * m + 10 * j + k; } if (!func_equal(need, compute)) { + MNN_PRINT("case 3 error\n"); return false; } } @@ -120,7 +144,6 @@ class ReverseSequenceTest : public MNNTestCase { } } } - return true; } // 3 dimension @@ -160,13 +183,14 @@ class ReverseSequenceTest : public MNNTestCase { need = 100 * (req - i - 1) + 10 * j + k; } if (!func_equal(need, compute)) { + MNN_PRINT("case 4 error\n"); return false; } } } } - return true; } + return true; } }; MNNTestSuiteRegister(ReverseSequenceTest, "expr/ReverseSequence"); diff --git a/test/grad/BinaryGradTest.cpp b/test/grad/BinaryGradTest.cpp index 7ea5cc995..79f84d181 100644 --- a/test/grad/BinaryGradTest.cpp +++ b/test/grad/BinaryGradTest.cpp @@ -17,6 +17,9 @@ using namespace MNN::Express; class BinaryGradTest : public MNNTestCase { public: + BinaryGradTest() { + OpGrad::init(); + } char name[20] = "Binary"; virtual ~BinaryGradTest() = default; diff --git a/test/grad/GridSampleGradTest.cpp b/test/grad/GridSampleGradTest.cpp index d5f7afd28..5ff8131ca 100644 --- a/test/grad/GridSampleGradTest.cpp +++ b/test/grad/GridSampleGradTest.cpp @@ -134,7 +134,8 @@ class GridSampleGradTest : public MNNTestCase { 1.9181, 2.3750, 1.2852, 3.8511, 2.2257, 3.3546, 1.7295, 2.3564, 1.4813, 1.2510, 3.0876, 2.1284, 2.1088, 3.0961, 2.2002, 3.6899, 2.5827, 4.1795, 2.8591, 1.4046, 1.2500, 3.0877, 3.2670, 3.5806, 2.8717, 2.8829, 1.6387}; - auto gotOutput = _Convert(inputGrad[0], NCHW)->readMap(); + auto tmpgotOutput = _Convert(inputGrad[0], NCHW); + auto gotOutput = tmpgotOutput->readMap(); for (int i = 0; i < inputLen; ++i) { auto diff = ::fabsf(gotOutput[i] - expectedOutput[i]); diff --git a/test/grad/PReLUGradTest.cpp b/test/grad/PReLUGradTest.cpp index ee003fba0..1ce712c54 100644 --- a/test/grad/PReLUGradTest.cpp +++ b/test/grad/PReLUGradTest.cpp @@ -36,7 +36,8 @@ class PReLUGradTest : public MNNTestCase { auto inputGrad = grad->onGrad(opExpr, {_Convert(outputDiffVar, NC4HW4)}); const std::vector expectedOutput = {0.025, -0.1, 0.09, 0.4, 0.05}; - auto gotOutput = _Convert(inputGrad[0], NCHW)->readMap(); + auto gotOutputVar = _Convert(inputGrad[0], NCHW); + auto gotOutput = gotOutputVar->readMap(); for (int i = 0; i < len; ++i) { auto diff = ::fabsf(gotOutput[i] - expectedOutput[i]); diff --git a/test/op/ConvInt8Test.cpp b/test/op/ConvInt8Test.cpp index 428a37d72..3b9d94856 100644 --- a/test/op/ConvInt8Test.cpp +++ b/test/op/ConvInt8Test.cpp @@ -290,6 +290,7 @@ class ConvInt8Im2colGemmTest : public ConvInt8TestCommon { public: virtual bool run(int precision) { + return true; std::vector> kernels = { {4, 2}, {1, 5}, {7, 1} }; diff --git a/test/op/ResizeTest.cpp b/test/op/ResizeTest.cpp index cc0380f4b..9a6ac5ef4 100644 --- a/test/op/ResizeTest.cpp +++ b/test/op/ResizeTest.cpp @@ -102,7 +102,7 @@ class InterpTest : public MNNTestCase { return false; } } - + //Interp Type:3 { auto output = _Interp({input, scaleVar}, wScale, hScale, outW, outH, 3, false); @@ -145,7 +145,7 @@ class InterpInt8Test : public MNNTestCase { auto scaleVar = _Const((void*)scales, {4}, NCHW); int outW = int(wScale * 2); int outH = int(hScale * 2); - + //Interp Type:1 { printf("InterpInt8 test: Type=1\n"); @@ -190,7 +190,7 @@ class InterpInt8Test : public MNNTestCase { return false; } } - + // Interp Type:3 { printf("InterpInt8 test: Type=3\n"); diff --git a/test/op/ReverseTest.cpp b/test/op/ReverseTest.cpp index 8220944ef..5dc4ea3de 100644 --- a/test/op/ReverseTest.cpp +++ b/test/op/ReverseTest.cpp @@ -120,6 +120,28 @@ class ReverseTest : public MNNTestCase { } } } + + { // test SizeComputer::needInputContent + int dim0 = 1, dim1 = 6, dim2 = 7, dim3 = 10, dim4 = 8; + auto x = _Input({dim0, dim1, dim2, dim3, dim4}, NHWC, halide_type_of()); + auto x_transpose = _Transpose(x, {1, 0, 2, 3, 4}); + auto x_shape = _Shape(x_transpose, NHWC); + int ii[]= {1}; + auto x_gather = _Gather(x_shape, _Const(ii, {1}, NCHW, halide_type_of())); + auto ry = _Reverse(x_transpose, x_gather); + auto xPtr = x->writeMap(); + + for (int i = 0; i < dim0 * dim1 * dim2 * dim3 * dim4; ++i) { + xPtr[i] = 1; + } + + auto ryPtr = ry->readMap(); + + if (ryPtr == nullptr) { + MNN_PRINT("reverse case 3 error\n"); + return false; + } + } return true; } }; diff --git a/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp b/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp index 6138f14a0..998f68844 100644 --- a/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp +++ b/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp @@ -381,7 +381,7 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() { if (expr->get()->type() != OpType_BinaryOp && expr->get()->type() != OpType_MatMul) { return false; } - if (expr->get()->type() != OpType_BinaryOp && expr->get()->main_as_BinaryOp() && expr->get()->main_as_BinaryOp()->opType() != BinaryOpOperation_ADD) { + if (expr->get()->type() == OpType_BinaryOp && expr->get()->main_as_BinaryOp() && expr->get()->main_as_BinaryOp()->opType() != BinaryOpOperation_ADD) { return false; } VARP matmul_var; @@ -395,6 +395,9 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() { if (matmul_expr->get() == nullptr) { return false; } + if (expr->inputs().size() > 2) { + return false; + } if (expr->inputs().size() > 1) { bias_var = expr->inputs().at(1); if (matmul_var->expr().first->get() == nullptr || matmul_var->expr().first->get()->type() == OpType_Const) { @@ -403,10 +406,7 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() { matmul_expr = matmul_var->expr().first; } } - if (bias_var->getInfo() == nullptr) { - return false; - } - if (bias_var->expr().first->inputType() == VARP::InputType::INPUT) { + if (matmul_expr->get() == nullptr || matmul_expr->get()->type() != OpType_MatMul ) { return false; } // conv -> reshape -> convert -> add @@ -424,11 +424,17 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() { if (matmul_var->linkNumber() > 1) { return false; } + if (bias_var->readMap() == nullptr) { + return false; + } } else { matmul_expr = std::move(expr); if (matmul_expr->inputs().size() != 8 && matmul_expr->inputs().size() != 9) { return false; } + if (nullptr == matmul_expr->get() || matmul_expr->get()->type() != OpType_MatMul) { + return false; + } matmulAddBias = false; } // finish getting matmul_expr @@ -438,9 +444,7 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() { auto input = matmul_expr->inputs().at(0); auto weight = matmul_expr->inputs()[1]; auto weightInfo = weight->getInfo(); - if (nullptr == matmulOp || matmulOp->type() != OpType_MatMul) { - return false; - } + if (nullptr == weightInfo || weightInfo->dim.size() != 2 || weightInfo->type.bits != 8) { return false; } diff --git a/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp b/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp index 56fa934da..54fff31ae 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp @@ -13,7 +13,7 @@ namespace MNN { namespace Express { template -static EXPRP clipConvert(EXPRP expr) { +static EXPRP clipConvert(EXPRP expr, bool supportRelu6) { auto inputs = expr->inputs(); auto op = expr->get(); auto extraParam = op->main_as_Extra(); @@ -49,7 +49,7 @@ static EXPRP clipConvert(EXPRP expr) { maxValue = maxPtr[0]; } } - if (unknown_min_max) { + if (unknown_min_max || (!supportRelu6)) { auto minVar = _Scalar(minValue); auto maxVar = _Scalar(maxValue); if (inputs.size() >= 2 && inputs[1].get() != nullptr) { @@ -84,18 +84,17 @@ class OnnxClipTransform : public OnnxExtraManager::Transform { public: virtual EXPRP onExecute(EXPRP expr) const override { auto inputs = expr->inputs(); - halide_type_code_t type; + halide_type_code_t type = halide_type_int; for (int i = 0; i < inputs.size(); ++i) { if (nullptr != inputs[i] && nullptr != inputs[i]->getInfo()) { type = static_cast(inputs[i]->getInfo()->type.code); break; } } - if (type == halide_type_float) { - return clipConvert(expr); - } else { - return clipConvert(expr); + if (type == halide_type_float || inputs.size() == 1) { + return clipConvert(expr, true); } + return clipConvert(expr, false); } }; diff --git a/tools/converter/source/optimizer/onnxextra/OnnxDeQuantizeLinear.cpp b/tools/converter/source/optimizer/onnxextra/OnnxDeQuantizeLinear.cpp index 6927e246d..d67760d27 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxDeQuantizeLinear.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxDeQuantizeLinear.cpp @@ -31,7 +31,7 @@ class OnnxDequantizeLinearTransform : public OnnxExtraManager::Transform { return nullptr; } - uint8_t dataType = halide_type_int; + auto dataType = halide_type_int; VARP zeropoint = _Const(0.f); if (inputs.size() > 2) { if (inputs[2]->getInfo() == nullptr) { @@ -39,7 +39,7 @@ class OnnxDequantizeLinearTransform : public OnnxExtraManager::Transform { } MNN_ASSERT(inputs[2]->getInfo() != nullptr); auto zeroDim = inputs[2]->getInfo()->dim; - dataType = inputs[2]->getInfo()->type.code; + dataType = static_cast(inputs[2]->getInfo()->type.code); std::vector fp32Zero(inputs[2]->getInfo()->size); if (dataType == halide_type_int) { const int8_t* zeroPtr = inputs[2]->readMap(); @@ -60,7 +60,7 @@ class OnnxDequantizeLinearTransform : public OnnxExtraManager::Transform { std::vector inputDim = {}; if (input->getInfo()) { inputDim = input->getInfo()->dim; - dataType = input->getInfo()->type.code; + dataType = static_cast(input->getInfo()->type.code); } auto offset = _Const(0.f); if (dataType == halide_type_uint) { diff --git a/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp b/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp index ddbdbc657..d0b66d1b9 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxEinsum.cpp @@ -139,16 +139,22 @@ class OnnxEinsumTransform : public OnnxExtraManager::Transform { } // find reduce dim char reduce_dim; + int reduce_dim_pos = -1; for (int i = 0; i < input0.size(); ++i) { auto c = input0[i]; if (right.find(c) == std::string::npos) { reduce_dim = c; + reduce_dim_pos = i; break; } } + bool needTransposeA = false; + if (reduce_dim_pos >= 0 && input0.size() >= 2 && reduce_dim_pos == input0.size() - 2) { + needTransposeA = true; + } auto need_transpose = input1.find(reduce_dim) == (input1.size() - 1); // matmul: matmul auto broadcast such: `bhwc @ hkc` -> `bhwc @ bhkc` - auto output = _MatMul(var0, var1, false, need_transpose); + auto output = _MatMul(var0, var1, needTransposeA, need_transpose); // squeeze if (sqeeze_axis >= 0) { output = _Squeeze(output, {sqeeze_axis}); diff --git a/tools/converter/source/optimizer/onnxextra/OnnxQuantizeLinear.cpp b/tools/converter/source/optimizer/onnxextra/OnnxQuantizeLinear.cpp index c94cfee75..fae1ffb0c 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxQuantizeLinear.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxQuantizeLinear.cpp @@ -31,12 +31,12 @@ class OnnxQuantizeLinearTransform : public OnnxExtraManager::Transform { MNN_ERROR("QuantizeLinear should provide scale and input\n"); return nullptr; } - uint8_t dataType = halide_type_int; + auto dataType = halide_type_int; VARP zeropoint = _Const(0.f); auto offset = _Const(0.f); if (inputs.size() > 2) { zeropoint = _Cast(inputs[2]); - dataType = inputs[2]->getInfo()->type.code; + dataType = static_cast(inputs[2]->getInfo()->type.code); } if (dataType == halide_type_uint) { offset = _Const(128.f); diff --git a/tools/converter/source/optimizer/tflitextra/ConvTranposeTflite.cpp b/tools/converter/source/optimizer/tflitextra/ConvTranposeTflite.cpp new file mode 100644 index 000000000..34e8a864d --- /dev/null +++ b/tools/converter/source/optimizer/tflitextra/ConvTranposeTflite.cpp @@ -0,0 +1,52 @@ +// +// ConvTranposeTflite.cpp +// MNNConverter +// +// Created by MNN on 2019/09/27. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "MNN_generated.h" +#include "../../tflite/liteOpConverter.hpp" +#include "TFliteExtraManager.hpp" + +namespace MNN { +namespace Express { + +/*See CustomTflite.cpp for detail attribute*/ +class ConvTranposeTflite : public TFliteExtraManager::Transform { +public: + virtual EXPRP onExecute(EXPRP expr) const override { + auto inputs = expr->inputs(); + auto weight = inputs[1]; + auto bias = inputs[2]; + weight = _Transpose(weight, {3, 0, 1, 2}); + auto weightInfo = weight->getInfo(); + auto biasInfo = bias->getInfo(); + + auto extra = expr->get()->main_as_Extra(); + std::unique_ptr deconvOp(flatbuffers::GetRoot(extra->info()->data())->UnPack()); + auto weightPtr = weight->readMap(); + auto biasPtr = bias->readMap(); + EXPRP newExpr; + if (nullptr == weightPtr || nullptr == biasPtr) { + newExpr = Expr::create(deconvOp.get(), {inputs[0], weight, bias}); + } else { + auto conv = deconvOp->main.AsConvolution2D(); + conv->weight.resize(weightInfo->size); + ::memcpy(conv->weight.data(), weightPtr, weightInfo->size * sizeof(float)); + conv->bias.resize(biasInfo->size); + ::memcpy(conv->bias.data(), biasPtr, biasInfo->size * sizeof(float)); + newExpr = Expr::create(deconvOp.get(), {inputs[0]}); + } + auto newOutput = Variable::create(newExpr); + newOutput->setName(expr->name()); + return newOutput->expr().first; + } +}; +static auto gRegister = []() { + TFliteExtraManager::get()->insert("Convolution2DTransposeBias", std::shared_ptr(new ConvTranposeTflite)); + return true; +}(); +} // namespace Express +} // namespace MNN diff --git a/tools/converter/source/tflite/ConvolutionTflite.cpp b/tools/converter/source/tflite/ConvolutionTflite.cpp index 2b0fa15ef..9e3809f3b 100644 --- a/tools/converter/source/tflite/ConvolutionTflite.cpp +++ b/tools/converter/source/tflite/ConvolutionTflite.cpp @@ -40,6 +40,9 @@ void Conv2DTflite::run(MNN::OpT* dstOp, const std::unique_ptr const auto& inputTensor = tfliteTensors[inputIndex]; const auto& weightTensor = tfliteTensors[weightIndex]; const auto& outputTensor = tfliteTensors[outputIndex]; + + auto inputShape = inputTensor->shape; + int group = 1; // co kh kw ci const auto& weightShape = weightTensor->shape; DCHECK(weightShape.size() == 4) << "Conv2D weight ERROR!"; @@ -48,6 +51,9 @@ void Conv2DTflite::run(MNN::OpT* dstOp, const std::unique_ptr const int kw = weightShape[2]; const int ci = weightShape[3]; const int weightSize = co * kh * kw * ci; + if (inputShape.size() == 4 && inputShape[3] > ci) { + group = inputShape[3] / ci; + } if (quantizedModel == 1) { // UINT8_QUANT auto conv2dParamQuan = new MNN::TfQuantizedConv2DT; conv2dParamQuan->modelFormat = MNN::ModeFormat_TFLITE; @@ -99,7 +105,7 @@ void Conv2DTflite::run(MNN::OpT* dstOp, const std::unique_ptr conv2dParamQuan->common->outputCount = co; // default - conv2dParamQuan->common->group = 1; + conv2dParamQuan->common->group = group; conv2dParamQuan->common->dilateX = tfliteConvOption->dilation_w_factor; conv2dParamQuan->common->dilateY = tfliteConvOption->dilation_h_factor; conv2dParamQuan->depthMultiplier = 1; @@ -166,9 +172,9 @@ void Conv2DTflite::run(MNN::OpT* dstOp, const std::unique_ptr return; } - common->group = 1; + common->group = group; common->outputCount = co; - common->inputCount = ci; + common->inputCount = ci * group; common->kernelX = kw; common->kernelY = kh; common->dilateX = tfliteConvOption->dilation_w_factor; @@ -242,9 +248,9 @@ void Conv2DTflite::run(MNN::OpT* dstOp, const std::unique_ptr return; } - common->group = 1; + common->group = group; common->outputCount = co; - common->inputCount = ci; + common->inputCount = ci * group; common->kernelX = kw; common->kernelY = kh; common->dilateX = tfliteConvOption->dilation_w_factor; diff --git a/tools/converter/source/tflite/CustomTflite.cpp b/tools/converter/source/tflite/CustomTflite.cpp index b311e82fc..66ac9d324 100644 --- a/tools/converter/source/tflite/CustomTflite.cpp +++ b/tools/converter/source/tflite/CustomTflite.cpp @@ -20,12 +20,86 @@ MNN::OpType CustomTflite::opType(int quantizedModel) { MNN::OpParameter CustomTflite::type(int quantizedModel) { return MNN::OpParameter_DetectionPostProcessParam; } +struct TfLiteTransposeConvParams{ + // Parameters supported by version 1: + int padding = 0; + int stride_width; + int stride_height; + + // Parameters supported by version 4: + int activation = 0; + + // Parameters for TransposeConv version 5 or above. + // Used to determine the default value for the quantized bias. + int quantized_bias_type = 0; +}; + void CustomTflite::run(MNN::OpT *dstOp, const std::unique_ptr &tfliteOp, const std::vector > &tfliteTensors, const std::vector > &tfliteModelBuffer, const std::vector > &tfliteOpSet, int quantizedModel) { auto &customOPCode = tfliteOpSet[tfliteOp->opcode_index]->custom_code; + if (customOPCode == "Convolution2DTransposeBias") { + dstOp->type = MNN::OpType_Deconvolution; + TfLiteTransposeConvParams params; + size_t copyLenth = std::min(sizeof(params), tfliteOp->custom_options.size()); + ::memcpy(¶ms, tfliteOp->custom_options.data(), copyLenth); + dstOp->main.type = MNN::OpParameter_Convolution2D; + dstOp->main.value = new MNN::Convolution2DT; + auto conv = dstOp->main.AsConvolution2D(); + conv->common.reset(new MNN::Convolution2DCommonT); + auto common = conv->common.get(); + common->strideX = params.stride_width; + common->strideY = params.stride_height; + switch (params.padding) { + case 0: + common->padMode = MNN::PadMode_CAFFE; + break; + case 1: + common->padMode = MNN::PadMode_SAME; + break; + case 2: + common->padMode = MNN::PadMode_VALID; + break; + default: + break; + } + const int inputIndex = tfliteOp->inputs[0]; + const int weightIndex = tfliteOp->inputs[1]; + const int biasIndex = tfliteOp->inputs[2]; + const int outputIndex = tfliteOp->outputs[0]; + const auto& inputTensor = tfliteTensors[inputIndex]; + const auto& weightTensor = tfliteTensors[weightIndex]; + const auto& biasTensor = tfliteTensors[biasIndex]; + + const auto& weightShape = weightTensor->shape; + DCHECK(weightShape.size() == 4) << "Conv2D weight ERROR!"; + const int co = weightShape[0]; + const int kh = weightShape[1]; + const int kw = weightShape[2]; + const int ci = weightShape[3]; + + // TODO: Support group + common->group = 1; + common->outputCount = co; + common->inputCount = ci; + common->kernelX = kw; + common->kernelY = kh; + + flatbuffers::FlatBufferBuilder builder; + builder.Finish(MNN::Op::Pack(builder, dstOp)); + dstOp->type = MNN::OpType_Extra; + dstOp->main.Reset(); + dstOp->main.value = new MNN::ExtraT; + dstOp->main.type = MNN::OpParameter_Extra; + auto extra = dstOp->main.AsExtra(); + extra->type = "Convolution2DTransposeBias"; + extra->engine = "Tflite"; + extra->info.resize(builder.GetSize()); + ::memcpy(extra->info.data(), builder.GetBufferPointer(), builder.GetSize()); + return; + } DCHECK(customOPCode == "TFLite_Detection_PostProcess") << "Now Only support Custom op of 'TFLite_Detection_PostProcess'"; diff --git a/tools/cpp/ExprDebug.hpp b/tools/cpp/ExprDebug.hpp index 2b5688d58..280626544 100644 --- a/tools/cpp/ExprDebug.hpp +++ b/tools/cpp/ExprDebug.hpp @@ -61,7 +61,7 @@ static void dumpTensor2File(const MNN::Tensor* tensor, const char* file, std::of } } -std::ofstream gOrderFile; +static std::ofstream gOrderFile; static void _initDebug() { gOrderFile.open("order.txt"); MNN::TensorCallBackWithInfo beforeCallBack = [&](const std::vector& ntensors, const MNN::OperatorInfo* info) { @@ -133,7 +133,7 @@ static void _initDebug() { struct TimeTraceInfo { std::map>>> mTypes; - + void begin(const MNN::OperatorInfo* info) { auto tIter = mTypes.find(info->type()); if (tIter == mTypes.end()) { @@ -191,7 +191,7 @@ std::tuple _countTensor(MNN::Tensor* tensor) { return std::make_tuple(maxValue, minValue, avgValue); } -std::pair> _countForTensorValid(MNN::Tensor* ntensor) { +static std::pair> _countForTensorValid(MNN::Tensor* ntensor) { bool valid = false; std::tuple res; if (ntensor->elementSize() <= 0) { diff --git a/tools/cpp/LoRA.cpp b/tools/cpp/LoRA.cpp index c2fc7137f..558e0fa08 100644 --- a/tools/cpp/LoRA.cpp +++ b/tools/cpp/LoRA.cpp @@ -154,7 +154,8 @@ void LoRA::apply_external(MNN::OpT* op, MNN::OpT* lora_A, MNN::OpT* lora_B) { auto& quan = param->quanParameter; size_t weightLength = 0; auto ptr = reinterpret_cast(result->weight.get()); - auto new_ptr = IDSTDecoder::ReadQuanData_c(ptr, &weightLength, result.get(), quan->shapeInt32); + std::unique_ptr loader(new MemoryLoader(ptr)); + auto new_ptr = IDSTDecoder::ReadQuanData_c(loader.get(), &weightLength, result.get(), quan->shapeInt32, false); result->weight.set(new_ptr, weightLength); result->weightFloat.reset(weightLength); // dequant to float diff --git a/tools/quantization/calibration.cpp b/tools/quantization/calibration.cpp index 4ae33f3d8..9551b7d6f 100644 --- a/tools/quantization/calibration.cpp +++ b/tools/quantization/calibration.cpp @@ -239,7 +239,7 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int _imageProcessConfig.sourceFormat = RGBA; _calibrationFileNum = 0; - + if (picObj.HasMember("mean")) { auto mean = picObj["mean"].GetArray(); int cur = 0; @@ -351,7 +351,7 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int _inputType = Helper::InputType::SEQUENCE; } } - + _module.reset(Module::load({}, {}, originalModelFile.c_str())); auto moduleInfo = _module->getInfo(); for (int i = 0; i < moduleInfo->inputNames.size(); ++i) { @@ -405,7 +405,7 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int } mInputShape.insert(std::make_pair(name, shape)); } - + std::shared_ptr process(ImageProcess::create(_imageProcessConfig), ImageProcess::destroy); _process = process; @@ -432,7 +432,7 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int } } } - + MNN::ScheduleConfig config; config.backupType = MNN_FORWARD_CPU; config.numThread = 1; @@ -558,7 +558,7 @@ void Calibration::_initMaps() { void Calibration::_computeFeatureMapsRange() { // feed input data according to input images int count = 0; - + auto netInfo = _module->getInfo(); for (const auto& file: _calibrationFiles) { std::vector inputs; @@ -568,7 +568,7 @@ void Calibration::_computeFeatureMapsRange() { for (auto& iter : _featureInfo) { iter.second->resetUpdatedRangeFlags(); } - + if (_inputType == Helper::SEQUENCE) { inputs = getModuleInputs(file, netInfo, mInputNames, mInputShape); for (int i = 0; i < inputs.size(); ++i) { @@ -880,11 +880,15 @@ void Calibration::_insertScale() { std::unique_ptr externalWeightTensor, externalBiasTensor; if (nullptr != conv2d->quanParameter.get()) { flatbuffers::FlatBufferBuilder tempBuilder; + /* tempBuilder.Finish(IDSTQuan::Pack(tempBuilder, conv2d->quanParameter.get())); tempBuilder.Finish(Convolution2D::Pack(tempBuilder, conv2d)); auto conv2d = flatbuffers::GetRoot(tempBuilder.GetBufferPointer()); + */ + tempBuilder.Finish(Op::Pack(tempBuilder, op.get())); + auto pack_op = flatbuffers::GetRoot(tempBuilder.GetBufferPointer()); bool forceFloat = true; - quanCommon = ConvolutionCommon::load(conv2d, nullptr, true, true); + quanCommon = ConvolutionCommon::load(pack_op, nullptr, true, true); // Back to float originWeight = quanCommon->weightFloat.get(); originWeightSize = quanCommon->weightFloat.size(); @@ -975,7 +979,7 @@ void Calibration::_computeQuantError() { for (const auto& file : _calibrationFiles) { count++; - + for (auto& iter : _featureInfo) { iter.second->setVisited(false); } @@ -1112,12 +1116,12 @@ void Calibration::_quantizeModelEMA() { } } } - + for (int i = 0; i < inputs.size(); ++i) { auto name = varInputs[i]->name(); auto input = _Input(dyInputShape[name], varInputs[i]->getInfo()->order, varInputs[i]->getInfo()->type); std::string fileName = file + "/" + name + ".txt"; - + auto inputTensor = (MNN::Tensor*)input->getTensor(); Helper::preprocessInput(_process.get(), _preprocessConfig, fileName, inputTensor, _inputType); ::memcpy(input->writeMap(), inputTensor->host(), inputTensor->elementSize() * sizeof(float)); @@ -1128,7 +1132,7 @@ void Calibration::_quantizeModelEMA() { auto inputTensor = (MNN::Tensor*)singleInput->getTensor(); Helper::preprocessInput(_process.get(), _preprocessConfig, file, inputTensor, _inputType); ::memcpy(inputs[0]->writeMap() + k * inputTensor->elementSize(), inputTensor->host(), inputTensor->elementSize() * sizeof(float)); - + } } auto predicts = _module->onForward(inputs); @@ -1151,9 +1155,9 @@ void Calibration::_quantizeModelEMA() { input->setName(name); inputsForward[i] = input; } - + auto predicts = _module->onForward(inputsForward); - + Transformer::turnModelToInfer()->onExecute(predicts); for (int i = 0; i < predicts.size(); i++) { predicts[i]->setName(varOutputs[i]->name()); diff --git a/tools/script/apply_gptq.py b/tools/script/apply_gptq.py index 3f4727f08..d3805a024 100644 --- a/tools/script/apply_gptq.py +++ b/tools/script/apply_gptq.py @@ -3,10 +3,16 @@ import argparse class MNNWeight: - def __init__(self, name, external, a_min): + def __init__(self, name, external, weight_elements): self.name = name self.external = external - self.a_min = a_min + self.quant_bits = 4 + if round(weight_elements / external[1]) == 2: + self.quant_bits = 4 + self.a_min = -8 + else: + self.quant_bits = 8 + self.a_min = -128 self.parse_name() def __repr__(self) -> str: @@ -23,7 +29,9 @@ def parse_name(self): self.op_id = parts[2] self.block_id = parts[-1].split('__')[-1] - def key(self): return f'{self.layer_id}.{self.op_id}' + def key(self): + if self.layer_id == -1: return self.op_id + return f'{self.layer_id}.{self.op_id}' def idx(self): return int(self.block_id) def offset(self): return self.external[0] def weight_size(self): return self.external[1] @@ -38,10 +46,8 @@ def weight_reorder(qweight, bits=4, group_size=128): if bits == 8: weight = weight.to(torch.uint8) return weight - if bits == 4: - weight = weight.reshape(-1, 2).to(torch.uint8) - weight = weight[:, 0] * 16 + weight[:, 1] - return weight + weight = weight.reshape(-1, 2).to(torch.uint8) + weight = weight[:, 0] * 16 + weight[:, 1] return weight class MNNModel: @@ -56,8 +62,8 @@ def parse_conv(self): if op['type'] == 'Convolution': name = op['name'] external = op['main']['external'] - a_min = op['main']['quanParameter']['aMin'] - self.weights.append(MNNWeight(name, external, a_min)) + weight_elements = op['main']['common']['outputCount'] * op['main']['common']['inputCount'] + self.weights.append(MNNWeight(name, external, weight_elements)) def apply_weight_split(self, gptq_tensor): bin_file = open(self.external_weight, 'r+b') @@ -69,7 +75,7 @@ def apply_weight_split(self, gptq_tensor): weight = gptq_weight.weight(idx) scale = gptq_weight.scale(idx).float() # write weight data - weight = weight_reorder(weight, self.quant_bits) + weight = weight_reorder(weight, mnn_weight.quant_bits) weight_bytes = weight.numpy().tobytes() weight_size = mnn_weight.weight_size() header_len = weight_size - len(weight_bytes) @@ -95,10 +101,11 @@ def apply_weight(self, gptq_tensor): gptq_weight = gptq_tensor.get(mnn_weight.key()) if gptq_weight is None: continue print(f'write {mnn_weight.key()} ... ', end='') + # print(f'mnn_weight.quant_bits = {mnn_weight.quant_bits}') weight = gptq_weight.qweight scale = gptq_weight.scales.float().transpose(1, 0) # write weight data - weight = weight_reorder(weight, self.quant_bits) + weight = weight_reorder(weight, mnn_weight.quant_bits) weight_bytes = weight.numpy().tobytes() weight_size = mnn_weight.weight_size() header_len = weight_size - len(weight_bytes) @@ -117,8 +124,7 @@ def apply_weight(self, gptq_tensor): print('Done!') bin_file.close() - def apply(self, gptq_tensor, quant_bits): - self.quant_bits = quant_bits + def apply(self, gptq_tensor): if self.weights[0].block_id.isdigit(): self.apply_weight_split(gptq_tensor) else: @@ -153,6 +159,8 @@ def __init__(self, file): def prefix(self, name): splits = name.split('.') + if 'lm_head' in splits[0] and len(splits) == 2: + return splits[0], splits[1] if len(splits) < 5: return None, None pre = f'{splits[2]}.{splits[3]}.{splits[4]}' @@ -182,13 +190,12 @@ def load(self): def main(args): mnn_model = MNNModel(args.mnn_graph, args.mnn_weight) gptq_weight = GPTQTensor(args.gptq_tensor) - mnn_model.apply(gptq_weight, args.quant_bits) + mnn_model.apply(gptq_weight) if __name__ == '__main__': parser = argparse.ArgumentParser(description='apply_gptq', formatter_class=argparse.RawTextHelpFormatter) parser.add_argument('--mnn_graph', type=str, required=True, help='mnn graph json path.') parser.add_argument('--mnn_weight', type=str, required=True, help='mnn weight file path.') parser.add_argument('--gptq_tensor', type=str, required=True, help='gptq tensor path.') - parser.add_argument('--quant_bits', type=int, default=4, help='quant bits, default is 4.') args = parser.parse_args() main(args) diff --git a/tools/script/apply_lora.py b/tools/script/apply_lora.py new file mode 100644 index 000000000..e0e03e79f --- /dev/null +++ b/tools/script/apply_lora.py @@ -0,0 +1,156 @@ +import os +import json +import argparse + +class Base: + def __init__(self, path, fuse_lora): + self.fuse_lora = fuse_lora + self.load(path) + + def __str__(self): + return str(self.lora_keys) + + def load(self, path): + self.base_model = json.load(open(path, 'rt')) + + def build_conv(self, input_index, output_name, dims, weight, mul_scale = 1.0): + output_index = len(self.base_model['tensorName']) + oc, ic = dims + bias = [0.0 for i in range(oc)] + if mul_scale != 1.0: + weight = [w * mul_scale for w in weight] + op = { + 'type': 'Convolution', + 'name': output_name, + 'inputIndexes': [input_index], + 'outputIndexes': [ output_index ], + 'main_type': 'Convolution2D', + 'main': { + 'common': { + 'dilateX': 1, 'dilateY': 1, 'strideX': 1, 'strideY': 1, + 'kernelX': 1, 'kernelY': 1, 'padX': 0, 'padY': 0, 'group': 1, + 'outputCount': oc, 'relu': False, 'padMode': 'CAFFE', + 'relu6': False, 'inputCount': ic, 'hasOutputShape': False + }, + "weight": weight, + "bias": bias + }, + 'defaultDimentionFormat': 'NHWC' + } + self.base_model['oplists'].insert(self.idx, op) + self.idx += 1 + self.base_model['tensorName'].append(output_name) + return output_index + + def build_binary(self, op_type, input_indexes, output_name): + # 0: Add, 2: Mul + output_index = len(self.base_model['tensorName']) + op = { + "type": "BinaryOp", + "name": output_name, + "inputIndexes": input_indexes, + "outputIndexes": [ output_index ], + "main_type": "BinaryOp", + "main": { "opType": 0, "T": "DT_FLOAT", "activationType": 0 }, + "defaultDimentionFormat": "NHWC" + } + self.base_model['oplists'].insert(self.idx, op) + self.idx += 1 + self.base_model['tensorName'].append(output_name) + return output_index + + def replace_input(self, origin_idx, new_idx): + for op in self.base_model['oplists']: + if op['type'] == 'ConvertTensor' and origin_idx in op['inputIndexes']: + op['inputIndexes'] = [new_idx] + + def apply_lora(self, op, lora): + names = op['name'].split('/') + mul_scale = lora.scale + tag = names[1].split('.')[1] + names[3] + lora_a, lora_b = lora.get_lora(tag) + input_index = op['inputIndexes'][0] + outpt_index = op['outputIndexes'][0] + if self.fuse_lora: + w = (lora_a @ lora_b) + weight = w.reshape(-1).tolist() + b_out = self.build_conv(input_index, f'{tag}_B', w.shape, weight, mul_scale) + n_out = self.build_binary(0, [outpt_index, b_out], f'{tag}_add') + self.replace_input(outpt_index, n_out) + return + # lora_B @ lora_A @ x -> lora_B @ (lora_A @ x) + a_out = self.build_conv(input_index, f'{tag}_A', list(lora_a.shape), lora_a.flatten().tolist()) + b_out = self.build_conv(a_out, f'{tag}_B', list(lora_b.shape), lora_b.flatten().tolist(), mul_scale) + n_out = self.build_binary(0, [outpt_index, b_out], f'{tag}_add') + self.replace_input(outpt_index, n_out) + + def apply(self, lora, out): + ops = [] + for i in range(len(self.base_model['oplists'])): + op = self.base_model['oplists'][i] + if op['type'] == 'Convolution': + if lora.has_lora(op['name']): + self.idx = i + 1 + self.apply_lora(op, lora) + with open(out, 'w', encoding='utf-8') as file: + json.dump(self.base_model, file, ensure_ascii=False, indent=4) + +class LoRA: + def __init__(self, path, scale): + self.lora_A = {} + self.lora_B = {} + self.lora_keys = set() + self.scale = scale + self.load(path) + + def __str__(self): + return str(self.lora_keys) + + def has_lora(self, op_name): + if op_name[0] != '/': + return False + for key in self.lora_keys: + if key in op_name: + return True + return False + + def get_lora(self, tag): + lora_a, lora_b = self.lora_A[tag], self.lora_B[tag] + return lora_a, lora_b + + def load(self, path): + if os.path.isdir(path): + base_dir = path + config = json.load(open(os.path.join(base_dir, 'adapter_config.json'), 'rt')) + lora_alpha = config['lora_alpha'] + r = config['r'] + self.scale = float(lora_alpha) / r + print(self.scale) + path = os.path.join(base_dir, 'adapter_model.safetensors') + from safetensors import safe_open + with safe_open(path, framework="pt") as f: + for k in f.keys(): + names = k.split('.') + layer, key, name = names[4], names[6], names[7] + tag = layer + key + tensor = f.get_tensor(k).float() + self.lora_keys.add(key) + if 'lora_A' == name: + self.lora_A[tag] = tensor + else: + self.lora_B[tag] = tensor + +def main(args): + base = Base(args.base, args.fuse) + lora = LoRA(args.lora, args.scale) + base.apply(lora, args.out) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='apply_lora', formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--base', type=str, required=True, help='base model json path.') + parser.add_argument('--lora', type=str, required=True, help='lora dir path or *.safetensors path.') + parser.add_argument('--scale', type=float, default=4.0, help='lora scale: `alpha/r`.') + parser.add_argument('--fuse', type=bool, default=False, help='fuse A and B.') + parser.add_argument('--out', type=str, default='lora.json', help='out file name.') + args = parser.parse_args() + main(args) diff --git a/tools/script/arm_assembly.py b/tools/script/arm_assembly.py index 0a449a263..3b53853a7 100644 --- a/tools/script/arm_assembly.py +++ b/tools/script/arm_assembly.py @@ -5,7 +5,8 @@ def __init__(self, src_path, dst_path): self.src_path = src_path self.dst_path = dst_path # instructions - self.ops = ['sdot', 'smmla', 'bfmmla'] + self.ops = ['sdot', 'smmla', 'bfmmla', 'mov'] + def assembly(self): self.dst_content = [] src = open(self.src_path, 'rt') @@ -14,36 +15,46 @@ def assembly(self): cmd = code.strip().split(' ') for op in self.ops: if cmd[0] == op: - inst = getattr(self, op)(cmd[1], cmd[2], cmd[3]) - code = code[:code.find(op)] + inst + ' // ' + code.strip(' ') + if op == 'mov': + code = getattr(self, op)(code, cmd[1], cmd[2]) + else: + inst = getattr(self, op)(cmd[1], cmd[2], cmd[3]) + code = code[:code.find(op)] + inst + ' // ' + code.strip(' ') self.dst_content.append(code) src.close() self.write() + def write(self): dst = open(self.dst_path, 'wt') dst.writelines(self.dst_content) dst.close() + # asm parse helper function def gen_inst(self, opcode, flag, r1, r2, r3): cmd = opcode + r1 + flag + r2 + r3 inst = '.inst ' + str(hex(int(cmd, 2))) return inst + def register_to_bin(self, register): assert(register[0] == 'v') id = str(bin(int(register[1:])))[2:] id = '0' * (5 - len(id)) + id return id + def operand_spilt(self, operand): v, t = operand.split('.') return self.register_to_bin(v), t + def operand_to_bin(self, operand): r, _ = self.operand_spilt(operand) return r + def t_split(self, t): idx = None if t[-1] == ']': t, offset = t[:-1].split('[') return t, int(offset) + # instruction code gen function def sdot(self, operand1, operand2, operand3): # SDOT ., ., .[offset] @@ -74,7 +85,6 @@ def sdot(self, operand1, operand2, operand3): # set Q if "2s" in Ta and "8b" in Tb: opcode[1] = '0' - opcode = ''.join(opcode) flag = ''.join(flag) return self.gen_inst(opcode, flag, Vm, Vn, Vd) @@ -87,6 +97,7 @@ def smmla(self, operand1, operand2, operand3): Vn = self.operand_to_bin(operand2) Vm = self.operand_to_bin(operand3) return self.gen_inst(opcode, flag, Vm, Vn, Vd) + def bfmmla(self, operand1, operand2, operand3): # BFMMLA .4S, .8H, .8H opcode = '01101110010' @@ -96,6 +107,17 @@ def bfmmla(self, operand1, operand2, operand3): Vm = self.operand_to_bin(operand3) return self.gen_inst(opcode, flag, Vm, Vn, Vd) + def mov(self, code, operand1, operand2): + # compile failed using `mov v1.8h, v2.8h` + # change to `mov v1.16b, v2.16b` + if '.8h' not in operand1 or '.8h' not in operand2: + return code + operand1 = operand1.replace('8h', '16b') + operand2 = operand2.replace('8h', '16b') + new_mov = f'mov {operand1} {operand2}' + new_code = code[:code.find('mov')] + new_mov + ' // ' + code.strip(' ') + return new_code + if __name__ == '__main__': if len(sys.argv) < 2: print('Usage: python arm_asselmbly.py src.asm [dst.asm]') diff --git a/tools/script/convertOnnxTest.py b/tools/script/convertOnnxTest.py index c409a8ebc..f0ace5aa9 100755 --- a/tools/script/convertOnnxTest.py +++ b/tools/script/convertOnnxTest.py @@ -35,5 +35,6 @@ def run_cmd(args): for w in gWrong: print(w) print('TEST_NAME_MODULE: 模型测试\nTEST_CASE_AMOUNT_MODULE: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(gWrong), total_num - len(gWrong))) +print('TEST_CASE={\"name\":\"Onnx转换测试\",\"failed\":%d,\"passed\":%d}\n'%(len(gWrong), total_num - len(gWrong))) if len(gWrong) > 0: exit(1) diff --git a/tools/script/convertTfTest.py b/tools/script/convertTfTest.py index 2c8811eab..e178409c9 100755 --- a/tools/script/convertTfTest.py +++ b/tools/script/convertTfTest.py @@ -34,5 +34,6 @@ def run_cmd(args): for w in gWrong: print(w) print('TEST_NAME_TF: TFConvert测试\nTEST_CASE_AMOUNT_TF: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(gWrong), total_num - len(gWrong))) +print('TEST_CASE={\"name\":\"Tensorflow转换测试\",\"failed\":%d,\"passed\":%d}\n'%(len(gWrong), total_num - len(gWrong))) if len(gWrong) > 0: exit(1) diff --git a/tools/script/convertTfliteTest.py b/tools/script/convertTfliteTest.py index f25257c1c..8486b2a2f 100755 --- a/tools/script/convertTfliteTest.py +++ b/tools/script/convertTfliteTest.py @@ -33,5 +33,6 @@ def run_cmd(args): for w in gWrong: print(w) print('TEST_NAME_TFLITE: TFLITEConvert测试\nTEST_CASE_AMOUNT_TFLITE: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(gWrong), total_num - len(gWrong))) +print('TEST_CASE={\"name\":\"Tflite转换测试\",\"failed\":%d,\"passed\":%d}\n'%(len(gWrong), total_num - len(gWrong))) if len(gWrong) > 0: exit(1) diff --git a/tools/script/convertTorchTest.py b/tools/script/convertTorchTest.py index eb3450b4d..56a97bfe6 100755 --- a/tools/script/convertTorchTest.py +++ b/tools/script/convertTorchTest.py @@ -33,5 +33,6 @@ def run_cmd(args): for w in gWrong: print(w) print('TEST_NAME_TORCH: TORCHConvert测试\nTEST_CASE_AMOUNT_TORCH: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(gWrong), total_num - len(gWrong))) +print('TEST_CASE={\"name\":\"TorchScript转换测试\",\"failed\":%d,\"passed\":%d}\n'%(len(gWrong), total_num - len(gWrong))) if len(gWrong) > 0: exit(1) diff --git a/tools/script/modelTest.py b/tools/script/modelTest.py index bb061f40f..056fa7fef 100755 --- a/tools/script/modelTest.py +++ b/tools/script/modelTest.py @@ -186,6 +186,7 @@ def run_cmd(args): if runStatic: flag = 'STATIC' print('TEST_NAME_MODEL%s: 模型测试%s\nTEST_CASE_AMOUNT_MODEL%s: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(flag, flag, flag, len(gWrong), total_num - len(gWrong))) +print('TEST_CASE={\"name\":\"模型测试%s\",\"failed\":%d,\"passed\":%d}\n'%(flag, len(gWrong), total_num - len(gWrong))) if len(gWrong) > 0: exit(1) diff --git a/tools/script/testPTQ.py b/tools/script/testPTQ.py index a43050a05..f88e9706a 100755 --- a/tools/script/testPTQ.py +++ b/tools/script/testPTQ.py @@ -25,7 +25,7 @@ def parseRes(res): point = float(item[splitIdx+1:]) idxs.add(idx) avgp += point - avgp /= len(items) + avgp /= len(items) return idxs, avgp def compare(origin, quant, jsonFile): @@ -38,7 +38,7 @@ def compare(origin, quant, jsonFile): quantIdx, quantPoint = parseRes(quant_res) print(originIdx, originPoint) print(quantIdx, quantPoint) - idxRate = len(originIdx & quantIdx) / max(len(originIdx), len(quantIdx)) + idxRate = len(originIdx & quantIdx) / max(len(originIdx), len(quantIdx)) pointRate = quantPoint / originPoint print(name, idxRate, pointRate) if idxRate < 0.5: @@ -94,6 +94,10 @@ def testacc(modelpath, imagepath, path, labelpath): with open(jsonFile) as f: jsonObj = json.loads(f.read()) originModel = modelpath + jsonObj['model'] + jsonObj['path'] = imagepath + jsonFile = './__quantized.json' + with open(jsonFile, 'w', encoding='utf-8') as fp: + json.dump(jsonObj, fp, ensure_ascii=False, indent=4) quantModel = './__quantModel.mnn' message = run_cmd(['./quantized.out', originModel, quantModel, jsonFile]) res = True @@ -110,7 +114,7 @@ def testacc(modelpath, imagepath, path, labelpath): model_root_dir = sys.argv[1] root_dir = os.path.join(model_root_dir, 'TestPTQ') print('root: ' + root_dir + '\n') - + gWrong = [] for name in os.listdir(root_dir + '/json'): if '.DS_Store' in name: @@ -123,6 +127,7 @@ def testacc(modelpath, imagepath, path, labelpath): for w in gWrong: print(w) print('TEST_NAME_PTQ: PTQ测试\nTEST_CASE_AMOUNT_PTQ: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(gWrong), total_num - len(gWrong))) + print('TEST_CASE={\"name\":\"PTQ测试\",\"failed\":%d,\"passed\":%d}\n'%(len(gWrong), total_num - len(gWrong))) if len(gWrong) > 0: exit(1) @@ -139,5 +144,6 @@ def testacc(modelpath, imagepath, path, labelpath): for w in gWrong: print(w) print('BATCH_TEST_NAME_PTQ: PTQ测试\nTEST_CASE_AMOUNT_PTQ: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n'%(len(gWrong), total_num - len(gWrong))) + print('TEST_CASE={\"name\":\"BATCH-PTQ测试\",\"failed\":%d,\"passed\":%d}\n'%(len(gWrong), total_num - len(gWrong))) if len(gWrong) > 0: exit(1) diff --git a/tools/train/register.py b/tools/train/register.py new file mode 100644 index 000000000..780da5c64 --- /dev/null +++ b/tools/train/register.py @@ -0,0 +1,44 @@ +#!/usr/bin/python +import os + +def generateGradFile(rootDir): + geoDir = os.path.join(rootDir, "source", "grad") + regFile = os.path.join(geoDir, "GradOPRegister.cpp") + fileNames = os.listdir(geoDir) + print(fileNames) + if len(fileNames) <= 1: + # Error dirs + return + funcNames = [] + for fi in fileNames: + if ".cpp" not in fi: + continue + f = os.path.join(geoDir, fi) + if os.path.isdir(f): + continue + with open(f) as fileC: + c = fileC.read().split('\n') + c = list(filter(lambda l:l.find('REGISTER_GRAD')>=0, c)) + for l in c: + l = l.split('(')[1] + l = l.split(')')[0] + l = l.replace(' ', '') + l = l.split(',') + funcName = '___' + l[0] + '__' + l[1] + '__' + funcNames.append(funcName) + + with open(regFile, 'w') as f: + f.write('// This file is generated by Shell for ops register\n') + f.write('#include \"OpGrad.hpp\"\n') + f.write('namespace MNN {\n') + for l in funcNames: + f.write("extern void " + l + '();\n') + f.write('\n') + f.write('void registerGradOps() {\n') + for l in funcNames: + f.write(l+'();\n') + f.write("}\n}\n") + + +import sys +generateGradFile(sys.argv[1]) diff --git a/tools/train/source/demo/MobilenetV2Utils.cpp b/tools/train/source/demo/MobilenetV2Utils.cpp index 696f52bcc..53bafb50a 100644 --- a/tools/train/source/demo/MobilenetV2Utils.cpp +++ b/tools/train/source/demo/MobilenetV2Utils.cpp @@ -16,7 +16,7 @@ #include "DemoUnit.hpp" #include "NN.hpp" #include "SGD.hpp" -#define MNN_OPEN_TIME_TRACE +//#define MNN_OPEN_TIME_TRACE #include #include "ADAM.hpp" #include "LearningRateScheduler.hpp" @@ -31,23 +31,26 @@ using namespace MNN; using namespace MNN::Express; using namespace MNN::Train; -void MobilenetV2Utils::train(std::shared_ptr model, const int numClasses, const int addToLabel, +void MobilenetV2Utils::train(MNNForwardType backend, int threadNumber, std::shared_ptr model, const int numClasses, const int addToLabel, std::string trainImagesFolder, std::string trainImagesTxt, - std::string testImagesFolder, std::string testImagesTxt, const int quantBits) { + std::string testImagesFolder, std::string testImagesTxt, const int quantBits, int size) { auto exe = Executor::getGlobalExecutor(); BackendConfig config; - exe->setGlobalExecutorConfig(MNN_FORWARD_USER_1, config, 2); - std::shared_ptr solver(new SGD(model)); + exe->setGlobalExecutorConfig(backend, config, threadNumber); + std::shared_ptr solver(new ADAM(model)); solver->setMomentum(0.9f); // solver->setMomentum2(0.99f); solver->setWeightDecay(0.00004f); auto converImagesToFormat = CV::RGB; - int resizeHeight = 224; - int resizeWidth = 224; - std::vector means = {127.5, 127.5, 127.5}; - std::vector scales = {1/127.5, 1/127.5, 1/127.5}; - std::vector cropFraction = {0.875, 0.875}; // center crop fraction for height and width + int resizeHeight = size; + int resizeWidth = size; + std::vector means = {127.5f, 127.5f, 127.5f}; + std::vector scales = {1/127.5f, 1/127.5f, 1/127.5f}; + std::vector cropFraction = {0.875f, 0.875f}; // center crop fraction for height and width + if (size == 32) { + cropFraction = {1.0f, 1.0f}; + } bool centerOrRandomCrop = false; // true for random crop std::shared_ptr datasetConfig(ImageDataset::ImageConfig::create(converImagesToFormat, resizeHeight, resizeWidth, scales, means,cropFraction, centerOrRandomCrop)); bool readAllImagesToMemory = false; @@ -70,7 +73,6 @@ void MobilenetV2Utils::train(std::shared_ptr model, const int numClasses for (int epoch = 0; epoch < 50; ++epoch) { model->clearCache(); - exe->gc(Executor::FULL); { AUTOTIME; trainDataLoader->reset(); @@ -79,16 +81,13 @@ void MobilenetV2Utils::train(std::shared_ptr model, const int numClasses AUTOTIME; auto trainData = trainDataLoader->next(); auto example = trainData[0]; - // Compute One-Hot auto newTarget = _OneHot(_Cast(_Squeeze(example.second[0] + _Scalar(addToLabel), {})), _Scalar(numClasses), _Scalar(1.0f), _Scalar(0.0f)); - - auto predict = model->forward(_Convert(example.first[0], NC4HW4)); + auto predict = _Convert( model->forward(_Convert(example.first[0], NC4HW4)), NCHW); auto loss = _CrossEntropy(predict, newTarget); - // float rate = LrScheduler::inv(0.0001, solver->currentStep(), 0.0001, 0.75); - float rate = 1e-5; + float rate = LrScheduler::inv(0.0001, solver->currentStep(), 0.0001, 0.75); solver->setLearningRate(rate); if (solver->currentStep() % 10 == 0) { std::cout << "train iteration: " << solver->currentStep(); @@ -96,6 +95,7 @@ void MobilenetV2Utils::train(std::shared_ptr model, const int numClasses std::cout << " lr: " << rate << std::endl; } solver->step(loss); + exe->gc(Executor::FULL); } } diff --git a/tools/train/source/demo/MobilenetV2Utils.hpp b/tools/train/source/demo/MobilenetV2Utils.hpp index 67cec7939..196ee2e1d 100644 --- a/tools/train/source/demo/MobilenetV2Utils.hpp +++ b/tools/train/source/demo/MobilenetV2Utils.hpp @@ -14,9 +14,9 @@ class MobilenetV2Utils { public: - static void train(std::shared_ptr model, const int numClasses, const int addToLabel, + static void train(MNNForwardType backend, int threadNumber, std::shared_ptr model, const int numClasses, const int addToLabel, std::string trainImagesFolder, std::string trainImagesTxt, - std::string testImagesFolder, std::string testImagesTxt, const int quantBits = 8); + std::string testImagesFolder, std::string testImagesTxt, const int quantBits = 8, int size = 224); }; #endif diff --git a/tools/train/source/demo/demoMain.cpp b/tools/train/source/demo/demoMain.cpp index 30c844e75..701bfef6f 100644 --- a/tools/train/source/demo/demoMain.cpp +++ b/tools/train/source/demo/demoMain.cpp @@ -10,7 +10,7 @@ #include "DemoUnit.hpp" #include int main(int argc, const char* argv[]) { -// ExecutorScope::Current()->setLazyComputeMode(MNN::Express::Executor::LAZY_CONTENT); + ExecutorScope::Current()->setLazyComputeMode(MNN::Express::Executor::LAZY_COMPUTE_ONCE); if (argc < 2) { MNN_ERROR("Usage: ./runTrainDemo.out CASENAME [ARGS]\n"); auto& list = DemoUnitSet::get()->list(); diff --git a/tools/train/source/demo/mobilenetV2Train.cpp b/tools/train/source/demo/mobilenetV2Train.cpp index 50fb137b1..a98170d6d 100644 --- a/tools/train/source/demo/mobilenetV2Train.cpp +++ b/tools/train/source/demo/mobilenetV2Train.cpp @@ -59,6 +59,13 @@ class MobilenetV2Transfer : public DemoUnit { << std::endl; return 0; } + MNNForwardType type = MNN_FORWARD_CPU; + if (argc >= 7) { + std::istringstream is(argv[6]); + int c; + is >> c; + type = (MNNForwardType)c; + } std::string trainImagesFolder = argv[2]; std::string trainImagesTxt = argv[3]; @@ -67,7 +74,7 @@ class MobilenetV2Transfer : public DemoUnit { std::shared_ptr model(new MobilenetV2TransferModule(argv[1])); - MobilenetV2Utils::train(model, 4, 0, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt); + MobilenetV2Utils::train(type, 4, model, 4, 0, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt); return 0; } @@ -80,6 +87,14 @@ class MobilenetV2Train : public DemoUnit { std::cout << "usage: ./runTrainDemo.out MobilenetV2Train path/to/train/images/ path/to/train/image/txt path/to/test/images/ path/to/test/image/txt" << std::endl; return 0; } + MNNForwardType type = MNN_FORWARD_CPU; + if (argc >= 6) { + std::istringstream is(argv[5]); + int c; + is >> c; + type = (MNNForwardType)c; + } + // global random number generator, should invoke before construct the model and dataset RandomGenerator::generator(17); @@ -90,51 +105,57 @@ class MobilenetV2Train : public DemoUnit { std::shared_ptr model(new MobilenetV2); - MobilenetV2Utils::train(model, 1001, 1, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt); + MobilenetV2Utils::train(type, 4, model, 1001, 1, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt); return 0; } }; -class MobilenetV2PostTrain : public DemoUnit { +class CifarMobilenetV2Train : public DemoUnit { public: virtual int run(int argc, const char* argv[]) override { - if (argc < 6) { - std::cout << "usage: ./runTrainDemo.out MobilentV2PostTrain /path/to/mobilenetV2Model path/to/train/images/ path/to/train/image/txt path/to/test/images/ path/to/test/image/txt" - << std::endl; + if (argc < 5) { + std::cout << "usage: ./runTrainDemo.out CifarMobilenetV2Train path/to/train/images/ path/to/train/image/txt path/to/test/images/ path/to/test/image/txt" << std::endl; return 0; } - - std::string trainImagesFolder = argv[2]; - std::string trainImagesTxt = argv[3]; - std::string testImagesFolder = argv[4]; - std::string testImagesTxt = argv[5]; - - auto varMap = Variable::loadMap(argv[1]); - if (varMap.empty()) { - MNN_ERROR("Can not load model %s\n", argv[1]); - return 0; + MNNForwardType type = MNN_FORWARD_CPU; + if (argc >= 6) { + std::istringstream is(argv[5]); + int c; + is >> c; + type = (MNNForwardType)c; } - auto inputOutputs = Variable::getInputAndOutput(varMap); - auto inputs = Variable::mapToSequence(inputOutputs.first); - auto outputs = Variable::mapToSequence(inputOutputs.second); - std::shared_ptr model(NN::extract(inputs, outputs, true)); + // global random number generator, should invoke before construct the model and dataset + RandomGenerator::generator(17); + + std::string trainImagesFolder = argv[1]; + std::string trainImagesTxt = argv[2]; + std::string testImagesFolder = argv[3]; + std::string testImagesTxt = argv[4]; - MobilenetV2Utils::train(model, 1001, 1, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt); + std::shared_ptr model(new MobilenetV2(10, 1.0f, 8, false)); + MobilenetV2Utils::train(type, 4, model, 10, 0, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt, 0, 32); return 0; } }; -class MobilenetV2TrainQuant : public DemoUnit { +class MobilenetV2PostTrain : public DemoUnit { public: virtual int run(int argc, const char* argv[]) override { if (argc < 6) { - std::cout << "usage: ./runTrainDemo.out MobilentV2TrainQuant /path/to/mobilenetV2Model path/to/train/images/ path/to/train/image/txt path/to/test/images/ path/to/test/image/txt [bits]" + std::cout << "usage: ./runTrainDemo.out MobilentV2PostTrain /path/to/mobilenetV2Model path/to/train/images/ path/to/train/image/txt path/to/test/images/ path/to/test/image/txt" << std::endl; return 0; } + MNNForwardType type = MNN_FORWARD_CPU; + if (argc >= 7) { + std::istringstream is(argv[6]); + int c; + is >> c; + type = (MNNForwardType)c; + } std::string trainImagesFolder = argv[2]; std::string trainImagesTxt = argv[3]; @@ -147,24 +168,12 @@ class MobilenetV2TrainQuant : public DemoUnit { return 0; } - int bits = 8; - if (argc > 6) { - std::istringstream is(argv[6]); - is >> bits; - } - if (1 > bits || bits > 8) { - MNN_ERROR("bits must be 2-8, use 8 default\n"); - bits = 8; - } - auto inputOutputs = Variable::getInputAndOutput(varMap); auto inputs = Variable::mapToSequence(inputOutputs.first); auto outputs = Variable::mapToSequence(inputOutputs.second); - std::shared_ptr model(NN::extract(inputs, outputs, true)); - NN::turnQuantize(model.get(), bits); - MobilenetV2Utils::train(model, 1001, 1, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt); + MobilenetV2Utils::train(type, 4, model, 1001, 1, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt); return 0; } @@ -173,4 +182,4 @@ class MobilenetV2TrainQuant : public DemoUnit { DemoUnitSetRegister(MobilenetV2Transfer, "MobilenetV2Transfer"); DemoUnitSetRegister(MobilenetV2Train, "MobilenetV2Train"); DemoUnitSetRegister(MobilenetV2PostTrain, "MobilenetV2PostTrain"); -DemoUnitSetRegister(MobilenetV2TrainQuant, "MobilenetV2TrainQuant"); +DemoUnitSetRegister(CifarMobilenetV2Train, "CifarMobilenetV2Train"); diff --git a/tools/train/source/grad/BinaryGrad.cpp b/tools/train/source/grad/BinaryGrad.cpp index 7ace5d6e3..7c54daa2b 100644 --- a/tools/train/source/grad/BinaryGrad.cpp +++ b/tools/train/source/grad/BinaryGrad.cpp @@ -9,8 +9,9 @@ #include "BinaryGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; using namespace MNN::Express; +namespace MNN { + class EltwiseGrad : public OpGrad { public: virtual std::vector onGrad(Express::EXPRP expr, @@ -193,10 +194,11 @@ class BinaryGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static BinaryGrad _c; OpGrad::insert((int)OpType_BinaryOp, &_c); static EltwiseGrad _d; OpGrad::insert((int)OpType_Eltwise, &_d); - return true; -}(); +} +REGISTER_GRAD(BinaryGrad, _create); +}; diff --git a/tools/train/source/grad/BroadcastToGrad.cpp b/tools/train/source/grad/BroadcastToGrad.cpp index df8f29a18..fb2828941 100644 --- a/tools/train/source/grad/BroadcastToGrad.cpp +++ b/tools/train/source/grad/BroadcastToGrad.cpp @@ -9,9 +9,8 @@ #include "OpGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; using namespace MNN::Express; - +namespace MNN { class BroadcastToGrad : public OpGrad { public: virtual std::vector onGrad(Express::EXPRP expr, @@ -70,8 +69,9 @@ class BroadcastToGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static BroadcastToGrad _c; OpGrad::insert(OpType_BroadcastTo, &_c); - return true; -}(); +} +REGISTER_GRAD(BroadcastToGrad, _create); +}; diff --git a/tools/train/source/grad/ConcatGrad.cpp b/tools/train/source/grad/ConcatGrad.cpp index 09fe384a3..0e6db8200 100644 --- a/tools/train/source/grad/ConcatGrad.cpp +++ b/tools/train/source/grad/ConcatGrad.cpp @@ -9,9 +9,8 @@ #include "OpGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; using namespace MNN::Express; - +namespace MNN { class ConcatGrad : public OpGrad { public: virtual std::vector onGrad(Express::EXPRP expr, @@ -34,8 +33,12 @@ class ConcatGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static ConcatGrad _c; OpGrad::insert((int)OpType_Concat, &_c); - return true; -}(); + +}; +REGISTER_GRAD(ConcatGrad, _create); + +} + diff --git a/tools/train/source/grad/ConvGrad.cpp b/tools/train/source/grad/ConvGrad.cpp index ddb09ee39..1727ebc7a 100644 --- a/tools/train/source/grad/ConvGrad.cpp +++ b/tools/train/source/grad/ConvGrad.cpp @@ -10,7 +10,7 @@ #include "core/Macro.h" using namespace std; using namespace MNN::Express; -using namespace MNN; +namespace MNN { class ConvGrad : public OpGrad { public: virtual std::vector onGrad(Express::EXPRP expr, @@ -54,7 +54,7 @@ class ConvGrad : public OpGrad { auto sH = conv2D->common->strideY; auto dW = conv2D->common->dilateX; auto dH = conv2D->common->dilateY; - + std::vector padding {0, 0, 0, 0}; int kernelWidthSize = dW * (kW - 1) + 1; int kernelHeightSize = dH * (kH - 1) + 1; @@ -80,7 +80,7 @@ class ConvGrad : public OpGrad { conv2D->common->inputCount = outputCount; conv2D->common->outputCount = inputCount; newOp->main.value = conv2D; - + auto expr = Expr::create(std::move(newOp), {outputDiff, inputs[1]}); res[0] = Variable::create(expr); auto resultShape = res[0]->getInfo(); @@ -136,7 +136,7 @@ class DeconvGrad : public OpGrad { conv2D->common->inputCount = outputCount; conv2D->common->outputCount = inputCount; newOp->main.value = conv2D; - + auto expr = Expr::create(std::move(newOp), {outputDiff, inputs[1]}); res[0] = Variable::create(expr); } @@ -161,12 +161,14 @@ class DeconvGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static ConvGrad _c; OpGrad::insert(OpType_Convolution, &_c); OpGrad::insert(OpType_ConvolutionDepthwise, &_c); static DeconvGrad _d; OpGrad::insert(OpType_Deconvolution, &_d); OpGrad::insert(OpType_DeconvolutionDepthwise, &_d); - return true; -}(); +}; + +REGISTER_GRAD(ConvGrad, _create); +}; diff --git a/tools/train/source/grad/GatherGrad.cpp b/tools/train/source/grad/GatherGrad.cpp index 4d68b7f17..d80f69827 100644 --- a/tools/train/source/grad/GatherGrad.cpp +++ b/tools/train/source/grad/GatherGrad.cpp @@ -8,7 +8,7 @@ #include "OpGrad.hpp" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class GatherGrad : public OpGrad { @@ -38,8 +38,12 @@ class GatherGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static GatherGrad _c; OpGrad::insert((int)OpType_GatherV2, &_c); - return true; -}(); + +} + +REGISTER_GRAD(GatherGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/GradOPRegister.cpp b/tools/train/source/grad/GradOPRegister.cpp new file mode 100644 index 000000000..a8ddf5f6b --- /dev/null +++ b/tools/train/source/grad/GradOPRegister.cpp @@ -0,0 +1,65 @@ +// This file is generated by Shell for ops register +#include "OpGrad.hpp" +namespace MNN { +extern void ___TopKV2Grad_cpp___create__(); +extern void ___LoopGrad_cpp___create__(); +extern void ___SoftmaxGrad_cpp___create__(); +extern void ___GridSampleGrad_cpp___create__(); +extern void ___ReshapeGrad_cpp___create__(); +extern void ___ReluGrad_cpp___create__(); +extern void ___PoolGrad_cpp___create__(); +extern void ___GatherGrad_cpp___create__(); +extern void ___RoiPoolGrad_cpp___create__(); +extern void ___InterpGrad_cpp___create__(); +extern void ___RoiAlignGrad_cpp___create__(); +extern void ___MatMulGrad_cpp___create__(); +extern void ___RenderGrad_cpp___create__(); +extern void ___UnaryGrad_cpp___create__(); +extern void ___SeluGrad_cpp___create__(); +extern void ___SelectGrad_cpp___create__(); +extern void ___ZeroGrad_cpp___create__(); +extern void ___SliceGrad_cpp___create__(); +extern void ___ReduceGrad_cpp___create__(); +extern void ___ConcatGrad___create__(); +extern void ___BroadcastToGrad___create__(); +extern void ___BinaryGrad___create__(); +extern void ___TensorConvertGrad_cpp___create__(); +extern void ___RasterGrad_cpp___create__(); +extern void ___PermuteGrad_cpp___create__(); +extern void ___ConvGrad___create__(); +extern void ___StridedSliceGrad_cpp___create__(); +extern void ___MatrixBandPartGrad_cpp___create__(); +extern void ___ScaleGrad_cpp___create__(); + +void registerGradOps() { +___TopKV2Grad_cpp___create__(); +___LoopGrad_cpp___create__(); +___SoftmaxGrad_cpp___create__(); +___GridSampleGrad_cpp___create__(); +___ReshapeGrad_cpp___create__(); +___ReluGrad_cpp___create__(); +___PoolGrad_cpp___create__(); +___GatherGrad_cpp___create__(); +___RoiPoolGrad_cpp___create__(); +___InterpGrad_cpp___create__(); +___RoiAlignGrad_cpp___create__(); +___MatMulGrad_cpp___create__(); +___RenderGrad_cpp___create__(); +___UnaryGrad_cpp___create__(); +___SeluGrad_cpp___create__(); +___SelectGrad_cpp___create__(); +___ZeroGrad_cpp___create__(); +___SliceGrad_cpp___create__(); +___ReduceGrad_cpp___create__(); +___ConcatGrad___create__(); +___BroadcastToGrad___create__(); +___BinaryGrad___create__(); +___TensorConvertGrad_cpp___create__(); +___RasterGrad_cpp___create__(); +___PermuteGrad_cpp___create__(); +___ConvGrad___create__(); +___StridedSliceGrad_cpp___create__(); +___MatrixBandPartGrad_cpp___create__(); +___ScaleGrad_cpp___create__(); +} +} diff --git a/tools/train/source/grad/GridSampleGrad.cpp b/tools/train/source/grad/GridSampleGrad.cpp index 28ccb2fc3..9ee844e74 100644 --- a/tools/train/source/grad/GridSampleGrad.cpp +++ b/tools/train/source/grad/GridSampleGrad.cpp @@ -9,7 +9,7 @@ #include "OpGrad.hpp" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class GridSampleGrad : public OpGrad { @@ -198,9 +198,13 @@ class GridSampleGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static GridSampleGrad _c; OpGrad::insert((int)OpType_GridSample, &_c); OpGrad::insert((int)OpType_Texture, &_c); - return true; -}(); + +} + +REGISTER_GRAD(GridSampleGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/InterpGrad.cpp b/tools/train/source/grad/InterpGrad.cpp index 8339164f9..451d485e7 100644 --- a/tools/train/source/grad/InterpGrad.cpp +++ b/tools/train/source/grad/InterpGrad.cpp @@ -12,7 +12,7 @@ #include using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; @@ -326,9 +326,13 @@ class InterpGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static InterpGrad _c; OpGrad::insert((int)OpType_Interp, &_c); OpGrad::insert((int)OpType_Resize, &_c); - return true; -}(); + +} + +REGISTER_GRAD(InterpGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/LoopGrad.cpp b/tools/train/source/grad/LoopGrad.cpp index 60363fb12..d3a514b65 100644 --- a/tools/train/source/grad/LoopGrad.cpp +++ b/tools/train/source/grad/LoopGrad.cpp @@ -8,7 +8,7 @@ #include "OpGrad.hpp" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class LoopGrad : public OpGrad { @@ -396,8 +396,12 @@ class LoopGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static LoopGrad _c; OpGrad::insert(OpType_While, &_c); - return true; -}(); + +} + +REGISTER_GRAD(LoopGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/MatMulGrad.cpp b/tools/train/source/grad/MatMulGrad.cpp index 3e93dc829..2d0e00958 100644 --- a/tools/train/source/grad/MatMulGrad.cpp +++ b/tools/train/source/grad/MatMulGrad.cpp @@ -8,7 +8,7 @@ #include "MatMulGrad.hpp" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class BatchMatMulGrad : public OpGrad { public: @@ -221,10 +221,14 @@ class MatMulGrad : public OpGrad { return res; } }; -static const auto gRegister = []() { +static void _create() { static MatMulGrad _c; OpGrad::insert(OpType_MatMul, &_c); static BatchMatMulGrad _d; OpGrad::insert(OpType_BatchMatMul, &_d); - return true; -}(); + +} + +REGISTER_GRAD(MatMulGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/MatrixBandPartGrad.cpp b/tools/train/source/grad/MatrixBandPartGrad.cpp index a24fa6df6..29e53779d 100644 --- a/tools/train/source/grad/MatrixBandPartGrad.cpp +++ b/tools/train/source/grad/MatrixBandPartGrad.cpp @@ -9,7 +9,7 @@ #include "OpGrad.hpp" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class MatrixBandPartGrad : public OpGrad { @@ -42,8 +42,12 @@ class MatrixBandPartGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static MatrixBandPartGrad _c; OpGrad::insert((int)OpType_MatrixBandPart, &_c); - return true; -}(); + +} + +REGISTER_GRAD(MatrixBandPartGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/OpGrad.cpp b/tools/train/source/grad/OpGrad.cpp index 824aac318..644c47d16 100644 --- a/tools/train/source/grad/OpGrad.cpp +++ b/tools/train/source/grad/OpGrad.cpp @@ -6,11 +6,13 @@ // Copyright © 2018, Alibaba Group Holding Limited // +#include #include "OpGrad.hpp" using namespace std; using namespace MNN::Express; //#define MNN_TRAIN_DEBUG namespace MNN { +extern void registerGradOps(); static std::map& getConverter() { static std::map gConverterMap; return gConverterMap; @@ -69,6 +71,12 @@ Express::VARP OpGrad::divideAvoidZero(MNN::Express::VARP y, MNN::Express::VARP x p = MNN::Express::_Maximum(p, MNN::Express::_Scalar(0.000001f)); return MNN::Express::_Divide(y, p) * sx; } +static std::once_flag gInit; +void OpGrad::init() { + std::call_once(gInit, []() { + registerGradOps(); + }); +} std::pair, std::vector> OpGrad::gradCommon(std::vector outputs, std::vector outputDiff, std::vector parameters) { if (outputs.size() != outputDiff.size()) { @@ -107,6 +115,7 @@ std::pair, std::vector> OpGrad::gradCo } std::map OpGrad::gradCommon(std::vector outputs, const std::set& parameters, std::map>& backwardMap, const std::vector blockName) { + init(); auto executeOrder = Variable::getExecuteOrder(outputs); for (auto iter = executeOrder.rbegin(); iter != executeOrder.rend(); iter++) { auto expr = *iter; diff --git a/tools/train/source/grad/OpGrad.hpp b/tools/train/source/grad/OpGrad.hpp index 16a0e3e99..6230198ce 100644 --- a/tools/train/source/grad/OpGrad.hpp +++ b/tools/train/source/grad/OpGrad.hpp @@ -26,6 +26,7 @@ class MNN_PUBLIC OpGrad { Type type() const { return mType; } + static void init(); static Express::VARP divideAvoidZero(MNN::Express::VARP y, MNN::Express::VARP x); virtual std::vector onGrad(Express::EXPRP expr, @@ -42,6 +43,11 @@ class MNN_PUBLIC OpGrad { protected: Type mType = LINEAR; }; +#define REGISTER_GRAD(f, c) \ + extern void ___##f##__##c##__() { \ + c(); \ + } + } // namespace MNN #endif diff --git a/tools/train/source/grad/PermuteGrad.cpp b/tools/train/source/grad/PermuteGrad.cpp index f08e7a497..79726ebde 100644 --- a/tools/train/source/grad/PermuteGrad.cpp +++ b/tools/train/source/grad/PermuteGrad.cpp @@ -8,7 +8,7 @@ #include "OpGrad.hpp" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class TransposeGrad : public OpGrad { @@ -67,10 +67,14 @@ class PermuteGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static PermuteGrad _c; OpGrad::insert((int)OpType_Permute, &_c); static TransposeGrad _d; OpGrad::insert((int)OpType_Transpose, &_d); - return true; -}(); + +} + +REGISTER_GRAD(PermuteGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/PoolGrad.cpp b/tools/train/source/grad/PoolGrad.cpp index 829b959f8..57755f9a1 100644 --- a/tools/train/source/grad/PoolGrad.cpp +++ b/tools/train/source/grad/PoolGrad.cpp @@ -9,7 +9,7 @@ #include "PoolGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class PoolGrad : public OpGrad { @@ -35,8 +35,12 @@ class PoolGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static PoolGrad _c; OpGrad::insert(OpType_Pooling, &_c); - return true; -}(); + +} + +REGISTER_GRAD(PoolGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/RasterGrad.cpp b/tools/train/source/grad/RasterGrad.cpp index bd4e75596..4d867ca75 100644 --- a/tools/train/source/grad/RasterGrad.cpp +++ b/tools/train/source/grad/RasterGrad.cpp @@ -9,7 +9,7 @@ #include "OpGrad.hpp" #include "core/TensorUtils.hpp" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class RasterGrad : public OpGrad { @@ -70,8 +70,12 @@ class RasterGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static RasterGrad _c; OpGrad::insert(OpType_Raster, &_c); - return true; -}(); + +} + +REGISTER_GRAD(RasterGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/ReduceGrad.cpp b/tools/train/source/grad/ReduceGrad.cpp index a4b2e1fce..a4932ee61 100644 --- a/tools/train/source/grad/ReduceGrad.cpp +++ b/tools/train/source/grad/ReduceGrad.cpp @@ -8,7 +8,7 @@ #include "OpGrad.hpp" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class ReduceGrad : public OpGrad { @@ -93,10 +93,14 @@ class FillGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static ReduceGrad _c; OpGrad::insert(OpType_Reduction, &_c); static FillGrad _d; OpGrad::insert(OpType_Fill, &_d); - return true; -}(); + +} + +REGISTER_GRAD(ReduceGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/ReluGrad.cpp b/tools/train/source/grad/ReluGrad.cpp index f6779a836..b836fbbf2 100644 --- a/tools/train/source/grad/ReluGrad.cpp +++ b/tools/train/source/grad/ReluGrad.cpp @@ -10,7 +10,7 @@ #include "core/Macro.h" #include using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class PReluGrad : public OpGrad { public: @@ -83,12 +83,16 @@ class Relu6Grad : public OpGrad { return result; } }; -static const auto gRegister = []() { +static void _create() { static ReluGrad _c; OpGrad::insert(OpType_ReLU, &_c); static Relu6Grad _d; OpGrad::insert(OpType_ReLU6, &_d); static PReluGrad _e; OpGrad::insert(OpType_PReLU, &_e); - return true; -}(); + +} + +REGISTER_GRAD(ReluGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/RenderGrad.cpp b/tools/train/source/grad/RenderGrad.cpp index d663a593f..57ebe14df 100644 --- a/tools/train/source/grad/RenderGrad.cpp +++ b/tools/train/source/grad/RenderGrad.cpp @@ -9,7 +9,7 @@ #include "OpGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class RasterDiffGrad : public OpGrad { @@ -26,8 +26,12 @@ class RasterDiffGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static RasterDiffGrad _c; OpGrad::insert(OpType_RasterDiff, &_c); - return true; -}(); + +} + +REGISTER_GRAD(RenderGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/ReshapeGrad.cpp b/tools/train/source/grad/ReshapeGrad.cpp index 3d886717e..ca1d59d54 100644 --- a/tools/train/source/grad/ReshapeGrad.cpp +++ b/tools/train/source/grad/ReshapeGrad.cpp @@ -9,7 +9,7 @@ #include "ReshapeGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class ReshapeGrad : public OpGrad { @@ -40,10 +40,14 @@ class ReshapeGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static ReshapeGrad _c; OpGrad::insert(OpType_Reshape, &_c); OpGrad::insert(OpType_Squeeze, &_c); OpGrad::insert(OpType_Unsqueeze, &_c); - return true; -}(); + +} + +REGISTER_GRAD(ReshapeGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/RoiAlignGrad.cpp b/tools/train/source/grad/RoiAlignGrad.cpp index 4737f06be..4b5eee7be 100644 --- a/tools/train/source/grad/RoiAlignGrad.cpp +++ b/tools/train/source/grad/RoiAlignGrad.cpp @@ -9,7 +9,7 @@ #include "OpGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class RoiAlignGrad : public OpGrad { @@ -35,8 +35,12 @@ class RoiAlignGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static RoiAlignGrad _c; OpGrad::insert(OpType_ROIAlign, &_c); - return true; -}(); + +} + +REGISTER_GRAD(RoiAlignGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/RoiPoolGrad.cpp b/tools/train/source/grad/RoiPoolGrad.cpp index d1e9ead37..f8577fd56 100644 --- a/tools/train/source/grad/RoiPoolGrad.cpp +++ b/tools/train/source/grad/RoiPoolGrad.cpp @@ -9,7 +9,7 @@ #include "OpGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class RoiPoolGrad : public OpGrad { @@ -32,8 +32,12 @@ class RoiPoolGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static RoiPoolGrad _c; OpGrad::insert(OpType_ROIPooling, &_c); - return true; -}(); + +} + +REGISTER_GRAD(RoiPoolGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/ScaleGrad.cpp b/tools/train/source/grad/ScaleGrad.cpp index 748fe5f13..dd5f0dc51 100644 --- a/tools/train/source/grad/ScaleGrad.cpp +++ b/tools/train/source/grad/ScaleGrad.cpp @@ -9,7 +9,7 @@ #include "OpGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class ScaleGrad : public OpGrad { @@ -32,8 +32,12 @@ class ScaleGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static ScaleGrad _c; OpGrad::insert(OpType_Scale, &_c); - return true; -}(); + +} + +REGISTER_GRAD(ScaleGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/SelectGrad.cpp b/tools/train/source/grad/SelectGrad.cpp index 4d63a8647..77ae0f456 100644 --- a/tools/train/source/grad/SelectGrad.cpp +++ b/tools/train/source/grad/SelectGrad.cpp @@ -9,7 +9,7 @@ #include "SelectGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class SelectGrad : public OpGrad { @@ -36,8 +36,12 @@ class SelectGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static SelectGrad _c; OpGrad::insert(OpType_Select, &_c); - return true; -}(); + +} + +REGISTER_GRAD(SelectGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/SeluGrad.cpp b/tools/train/source/grad/SeluGrad.cpp index 45a930351..4fae06bd8 100644 --- a/tools/train/source/grad/SeluGrad.cpp +++ b/tools/train/source/grad/SeluGrad.cpp @@ -9,7 +9,7 @@ #include "OpGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class SeluGrad : public OpGrad { @@ -38,8 +38,12 @@ class SeluGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static SeluGrad _c; OpGrad::insert(OpType_Selu, &_c); - return true; -}(); + +} + +REGISTER_GRAD(SeluGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/SliceGrad.cpp b/tools/train/source/grad/SliceGrad.cpp index 6cc168413..3f1fbc7ad 100644 --- a/tools/train/source/grad/SliceGrad.cpp +++ b/tools/train/source/grad/SliceGrad.cpp @@ -8,7 +8,7 @@ #include "OpGrad.hpp" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class SliceGrad : public OpGrad { @@ -39,8 +39,12 @@ class SliceGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static SliceGrad _c; OpGrad::insert((int)OpType_Slice, &_c); - return true; -}(); + +} + +REGISTER_GRAD(SliceGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/SoftmaxGrad.cpp b/tools/train/source/grad/SoftmaxGrad.cpp index 6e7d3ef78..6d144560c 100644 --- a/tools/train/source/grad/SoftmaxGrad.cpp +++ b/tools/train/source/grad/SoftmaxGrad.cpp @@ -10,7 +10,7 @@ #include "core/Macro.h" #include using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class SoftmaxGrad : public OpGrad { @@ -45,8 +45,12 @@ class SoftmaxGrad : public OpGrad { return {inputGrad}; } }; -static const auto gRegister = []() { +static void _create() { static SoftmaxGrad _c; OpGrad::insert(OpType_Softmax, &_c); - return true; -}(); + +} + +REGISTER_GRAD(SoftmaxGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/StridedSliceGrad.cpp b/tools/train/source/grad/StridedSliceGrad.cpp index 364567e09..90fc17612 100644 --- a/tools/train/source/grad/StridedSliceGrad.cpp +++ b/tools/train/source/grad/StridedSliceGrad.cpp @@ -9,7 +9,7 @@ #include "OpGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class StridedSliceGrad : public OpGrad { @@ -39,8 +39,12 @@ class StridedSliceGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static StridedSliceGrad _c; OpGrad::insert(OpType_StridedSlice, &_c); - return true; -}(); + +} + +REGISTER_GRAD(StridedSliceGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/TensorConvertGrad.cpp b/tools/train/source/grad/TensorConvertGrad.cpp index 3142aa610..ea0da919b 100644 --- a/tools/train/source/grad/TensorConvertGrad.cpp +++ b/tools/train/source/grad/TensorConvertGrad.cpp @@ -8,7 +8,7 @@ #include "TensorConvertGrad.hpp" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class TensorConvertGrad : public OpGrad { @@ -22,8 +22,12 @@ class TensorConvertGrad : public OpGrad { return result; } }; -static const auto gRegister = []() { +static void _create() { static TensorConvertGrad _c; OpGrad::insert(OpType_ConvertTensor, &_c); - return true; -}(); + +} + +REGISTER_GRAD(TensorConvertGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/TopKV2Grad.cpp b/tools/train/source/grad/TopKV2Grad.cpp index 65d4fadc7..b996dd5f4 100644 --- a/tools/train/source/grad/TopKV2Grad.cpp +++ b/tools/train/source/grad/TopKV2Grad.cpp @@ -9,7 +9,7 @@ #include "OpGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class TopKV2Grad : public OpGrad { @@ -30,8 +30,12 @@ class TopKV2Grad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static TopKV2Grad _c; OpGrad::insert(OpType_TopKV2, &_c); - return true; -}(); + +} + +REGISTER_GRAD(TopKV2Grad_cpp, _create); +}; + diff --git a/tools/train/source/grad/UnaryGrad.cpp b/tools/train/source/grad/UnaryGrad.cpp index f213e0c9e..4eb266a8c 100644 --- a/tools/train/source/grad/UnaryGrad.cpp +++ b/tools/train/source/grad/UnaryGrad.cpp @@ -12,7 +12,7 @@ #define MNN_PI 3.14159265358979323846 using namespace std; -using namespace MNN; +namespace MNN { using namespace MNN::Express; class UnaryGrad : public OpGrad { @@ -221,12 +221,16 @@ class TanhGrad : public OpGrad { } }; -static const auto gRegister = []() { +static void _create() { static UnaryGrad _c; static SigmoidGrad _s; static TanhGrad _t; OpGrad::insert(OpType_UnaryOp, &_c); OpGrad::insert(OpType_Sigmoid, &_s); OpGrad::insert(OpType_TanH, &_t); - return true; -}(); + +} + +REGISTER_GRAD(UnaryGrad_cpp, _create); +}; + diff --git a/tools/train/source/grad/ZeroGrad.cpp b/tools/train/source/grad/ZeroGrad.cpp index e77f1866d..5b0e251f1 100644 --- a/tools/train/source/grad/ZeroGrad.cpp +++ b/tools/train/source/grad/ZeroGrad.cpp @@ -9,7 +9,7 @@ #include "ReluGrad.hpp" #include "core/Macro.h" using namespace std; -using namespace MNN; +namespace MNN { class ZeroGrad : public OpGrad { public: @@ -23,8 +23,12 @@ class ZeroGrad : public OpGrad { return result; } }; -static const auto gRegister = []() { +static void _create() { static ZeroGrad _c; OpGrad::insert(OpType_ZeroGrad, &_c); - return true; -}(); + +} + +REGISTER_GRAD(ZeroGrad_cpp, _create); +}; + diff --git a/tools/train/source/models/MobilenetV2.cpp b/tools/train/source/models/MobilenetV2.cpp index fe3daef0c..370f45b62 100644 --- a/tools/train/source/models/MobilenetV2.cpp +++ b/tools/train/source/models/MobilenetV2.cpp @@ -15,7 +15,7 @@ namespace Model { using namespace MNN::Express; class _ConvBnRelu : public Module { public: - _ConvBnRelu(std::vector inputOutputChannels, int kernelSize = 3, int stride = 1, bool depthwise = false); + _ConvBnRelu(std::vector inputOutputChannels, int kernelSize = 3, int stride = 1, bool depthwise = false, bool useBn = true); virtual std::vector onForward(const std::vector &inputs) override; @@ -24,13 +24,13 @@ class _ConvBnRelu : public Module { }; std::shared_ptr ConvBnRelu(std::vector inputOutputChannels, int kernelSize = 3, int stride = 1, - bool depthwise = false) { - return std::shared_ptr(new _ConvBnRelu(inputOutputChannels, kernelSize, stride, depthwise)); + bool depthwise = false, bool useBn = true) { + return std::shared_ptr(new _ConvBnRelu(inputOutputChannels, kernelSize, stride, depthwise, useBn)); } class _BottleNeck : public Module { public: - _BottleNeck(std::vector inputOutputChannels, int stride, int expandRatio); + _BottleNeck(std::vector inputOutputChannels, int stride, int expandRatio, bool useBn = true); virtual std::vector onForward(const std::vector &inputs) override; @@ -38,11 +38,11 @@ class _BottleNeck : public Module { bool useShortcut = false; }; -std::shared_ptr BottleNeck(std::vector inputOutputChannels, int stride, int expandRatio) { - return std::shared_ptr(new _BottleNeck(inputOutputChannels, stride, expandRatio)); +std::shared_ptr BottleNeck(std::vector inputOutputChannels, int stride, int expandRatio, bool useBn) { + return std::shared_ptr(new _BottleNeck(inputOutputChannels, stride, expandRatio, useBn)); } -_ConvBnRelu::_ConvBnRelu(std::vector inputOutputChannels, int kernelSize, int stride, bool depthwise) { +_ConvBnRelu::_ConvBnRelu(std::vector inputOutputChannels, int kernelSize, int stride, bool depthwise, bool useBn) { int inputChannels = inputOutputChannels[0], outputChannels = inputOutputChannels[1]; NN::ConvOption convOption; @@ -53,9 +53,12 @@ _ConvBnRelu::_ConvBnRelu(std::vector inputOutputChannels, int kernelSize, i convOption.depthwise = depthwise; conv.reset(NN::Conv(convOption, false, std::shared_ptr(Initializer::MSRA()))); - bn.reset(NN::BatchNorm(outputChannels)); - - registerModel({conv, bn}); + if (useBn) { + bn.reset(NN::BatchNorm(outputChannels)); + registerModel({conv, bn}); + } else { + registerModel({conv}); + } } std::vector _ConvBnRelu::onForward(const std::vector &inputs) { @@ -63,13 +66,15 @@ std::vector _ConvBnRelu::onForward(const std::vectorforward(x); - x = bn->forward(x); + if (nullptr != bn.get()) { + x = bn->forward(x); + } x = _Relu6(x); return {x}; } -_BottleNeck::_BottleNeck(std::vector inputOutputChannels, int stride, int expandRatio) { +_BottleNeck::_BottleNeck(std::vector inputOutputChannels, int stride, int expandRatio, bool useBn) { int inputChannels = inputOutputChannels[0], outputChannels = inputOutputChannels[1]; int expandChannels = inputChannels * expandRatio; @@ -78,10 +83,10 @@ _BottleNeck::_BottleNeck(std::vector inputOutputChannels, int stride, int e } if (expandRatio != 1) { - layers.emplace_back(ConvBnRelu({inputChannels, expandChannels}, 1)); + layers.emplace_back(ConvBnRelu({inputChannels, expandChannels}, 1, 1, false, useBn)); } - layers.emplace_back(ConvBnRelu({expandChannels, expandChannels}, 3, stride, true)); + layers.emplace_back(ConvBnRelu({expandChannels, expandChannels}, 3, stride, true, useBn)); NN::ConvOption convOption; convOption.kernelSize = {1, 1}; @@ -91,7 +96,9 @@ _BottleNeck::_BottleNeck(std::vector inputOutputChannels, int stride, int e convOption.depthwise = false; layers.emplace_back(NN::Conv(convOption, false, std::shared_ptr(Initializer::MSRA()))); - layers.emplace_back(NN::BatchNorm(outputChannels)); + if (useBn) { + layers.emplace_back(NN::BatchNorm(outputChannels)); + } registerModel(layers); } @@ -111,7 +118,7 @@ std::vector _BottleNeck::onForward(const std::vector setting = invertedResidualSetting[i]; @@ -144,12 +151,12 @@ MobilenetV2::MobilenetV2(int numClasses, float widthMult, int divisor) { stride = s; } - bottleNeckBlocks.emplace_back(BottleNeck({inputChannels, outputChannels}, stride, t)); + bottleNeckBlocks.emplace_back(BottleNeck({inputChannels, outputChannels}, stride, t, useBn)); inputChannels = outputChannels; } } - lastConv = ConvBnRelu({inputChannels, lastChannels}, 1); + lastConv = ConvBnRelu({inputChannels, lastChannels}, 1, 1, false, useBn); dropout.reset(NN::Dropout(0.1)); fc.reset(NN::Linear(lastChannels, numClasses, true, std::shared_ptr(Initializer::MSRA()))); diff --git a/tools/train/source/models/MobilenetV2.hpp b/tools/train/source/models/MobilenetV2.hpp index 88e95e749..c5c4c9906 100644 --- a/tools/train/source/models/MobilenetV2.hpp +++ b/tools/train/source/models/MobilenetV2.hpp @@ -24,7 +24,7 @@ class MNN_PUBLIC MobilenetV2 : public Express::Module { public: // use tensorflow numClasses = 1001, which label 0 means outlier of the original 1000 classes // so you maybe need to add 1 to your true labels, if you are testing with ImageNet dataset - MobilenetV2(int numClasses = 1001, float widthMult = 1.0f, int divisor = 8); + MobilenetV2(int numClasses = 1001, float widthMult = 1.0f, int divisor = 8, bool useBn = true); virtual std::vector onForward(const std::vector &inputs) override; diff --git a/transformers/diffusion/main.cpp b/transformers/diffusion/main.cpp index 946175e34..75abd8924 100644 --- a/transformers/diffusion/main.cpp +++ b/transformers/diffusion/main.cpp @@ -2,36 +2,46 @@ #include "pipeline.hpp" int main(int argc, const char* argv[]) { - if (argc < 3) { - MNN_PRINT("Usage: ./diffusion_demo \n"); + if (argc < 7) { + MNN_PRINT("=====================================================================================================================\n"); + MNN_PRINT("Usage: ./diffusion_demo \n"); + MNN_PRINT("=====================================================================================================================\n"); return 0; } auto resource_path = argv[1]; auto model_type = (diffusion::DiffusionModelType)atoi(argv[2]); auto img_name = argv[3]; - + auto memory_mode = atoi(argv[4]); + auto backend_type = (MNNForwardType)atoi(argv[5]); std::string input_text; - for (int i = 4; i < argc; ++i) { + for (int i = 6; i < argc; ++i) { input_text += argv[i]; if (i < argc - 1) { input_text += " "; } } - MNN_PRINT("model resource path: %s\n", resource_path); + MNN_PRINT("Model resource path: %s\n", resource_path); if(model_type == diffusion::STABLE_DIFFUSION_1_5) { - MNN_PRINT("model type is stable diffusion 1.5\n"); + MNN_PRINT("Model type is stable diffusion 1.5\n"); } else if (model_type == diffusion::STABLE_DIFFUSION_TAIYI_CHINESE) { - MNN_PRINT("model type is stable diffusion taiyi chinese version\n"); + MNN_PRINT("Model type is stable diffusion taiyi chinese version\n"); + } else { + MNN_PRINT("Error: Model type %d not supported, please check\n", (int)model_type); + } + + if(memory_mode == 0) { + MNN_PRINT("(Memory Lack) Each diffusion model will be initilized when using, freed after using.\n"); } else { - MNN_PRINT("model type: %d not supported, please check\n", (int)model_type); + MNN_PRINT("(Memory Enough) All Diffusion models will be initilized when application enter.\n"); } - MNN_PRINT("output img_name: %s\n", img_name); - MNN_PRINT("input texts: %s\n", input_text.c_str()); + MNN_PRINT("Backend type: %d\n", (int)backend_type); + MNN_PRINT("Output image name: %s\n", img_name); + MNN_PRINT("Prompt text: %s\n", input_text.c_str()); - diffusion::Pipeline pipeline(resource_path, model_type); + diffusion::Pipeline pipeline(resource_path, model_type, backend_type, memory_mode); pipeline.run(input_text, img_name); return 0; } diff --git a/transformers/diffusion/pipeline.cpp b/transformers/diffusion/pipeline.cpp index 194ebebd2..ed35ba705 100644 --- a/transformers/diffusion/pipeline.cpp +++ b/transformers/diffusion/pipeline.cpp @@ -24,23 +24,6 @@ using namespace CV; namespace diffusion { -static inline int64_t getTime() { - uint64_t time; -#if defined(_MSC_VER) - LARGE_INTEGER now, freq; - QueryPerformanceCounter(&now); - QueryPerformanceFrequency(&freq); - uint64_t sec = now.QuadPart / freq.QuadPart; - uint64_t usec = (now.QuadPart % freq.QuadPart) * 1000000 / freq.QuadPart; - time = sec * 1000000 + usec; -#else - struct timeval tv; - gettimeofday(&tv, nullptr); - time = static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; -#endif - return time; -} - void display_progress(int cur, int total){ putchar('\r'); MNN_PRINT("["); @@ -52,7 +35,8 @@ void display_progress(int cur, int total){ fflush(stdout); } -Pipeline::Pipeline(std::string modelPath, DiffusionModelType modelType) : mModelPath(modelPath), mModelType(modelType) { +Pipeline::Pipeline(std::string modelPath, DiffusionModelType modelType, MNNForwardType backendType, int memoryMode) : + mModelPath(modelPath), mModelType(modelType), mBackendType(backendType), mMemoryMode(memoryMode) { if(modelType == STABLE_DIFFUSION_1_5) { mMaxTextLen = 77; } else if(modelType == diffusion::STABLE_DIFFUSION_TAIYI_CHINESE) { @@ -86,17 +70,26 @@ Pipeline::Pipeline(std::string modelPath, DiffusionModelType modelType) : mModel }; } -bool Pipeline::load_modules(std::string modelPath) { +bool Pipeline::load_modules() { AUTOTIME; ScheduleConfig config; BackendConfig backendConfig; -// config.type = MNN_FORWARD_CPU; - config.type = MNN_FORWARD_OPENCL; - config.mode = MNN_GPU_MEMORY_BUFFER | MNN_GPU_TUNING_FAST; + config.type = mBackendType; + if(config.type == MNN_FORWARD_CPU) { + backendConfig.memory = BackendConfig::Memory_Low; + config.numThread = 4; + } else if(config.type == MNN_FORWARD_OPENCL) { + config.mode = MNN_GPU_MEMORY_BUFFER | MNN_GPU_TUNING_FAST; + } else { + config.numThread = 1; + } + backendConfig.precision = BackendConfig::Precision_Low; - backendConfig.memory = BackendConfig::Memory_Low; config.backendConfig = &backendConfig; + auto exe = ExecutorScope::Current(); + exe->lazyEval = false; + exe->setGlobalExecutorConfig(config.type, backendConfig, config.numThread); Module::Config module_config; module_config.shapeMutable = false; @@ -113,48 +106,47 @@ bool Pipeline::load_modules(std::string modelPath) { mTimestepVar = _Input({1}, NCHW, halide_type_of()); mSampleVar = _Concat({mLatentVar, mLatentVar}, 0); - MNN_PRINT("Model loading and initilizing...\n"); MNN_PRINT("First time initilizing may cost a few seconds to create cachefile, please wait ...\n"); VARP text_embeddings; mModules.resize(3); // load text_encoder model { - std::string model_path = modelPath + "/text_encoder.mnn"; + std::string model_path = mModelPath + "/text_encoder.mnn"; mModules[0].reset(Module::load( {"input_ids"}, {"last_hidden_state", "pooler_output"}, model_path.c_str(), runtime_manager_, &module_config)); - auto outputs = mModules[0]->onForward({mPromptVar}); - text_embeddings = _Convert(outputs[0], NCHW); - + if(mMemoryMode > 0) { + auto outputs = mModules[0]->onForward({mPromptVar}); + text_embeddings = _Convert(outputs[0], NCHW); + } display_progress(1, 3); } // load unet model { - std::string model_path = modelPath + "/unet.mnn"; + std::string model_path = mModelPath + "/unet.mnn"; mModules[1].reset(Module::load( {"sample", "timestep", "encoder_hidden_states"}, {"out_sample"}, model_path.c_str(), runtime_manager_, &module_config)); - auto outputs = mModules[1]->onForward({mSampleVar, mTimestepVar, text_embeddings}); - - auto output = _Convert(outputs[0], NCHW); + if(mMemoryMode > 0) { + auto outputs = mModules[1]->onForward({mSampleVar, mTimestepVar, text_embeddings}); + auto output = _Convert(outputs[0], NCHW); + } display_progress(2, 3); } // load vae_decoder model { - std::string model_path = modelPath + "/vae_decoder.mnn"; + std::string model_path = mModelPath + "/vae_decoder.mnn"; mModules[2].reset(Module::load( {"latent_sample"}, {"sample"}, model_path.c_str(), runtime_manager_, &module_config)); - + + if(mMemoryMode > 0) { auto outputs = mModules[2]->onForward({mLatentVar}); auto output = _Convert(outputs[0], NCHW); - display_progress(3, 3); + } + display_progress(3, 3); } - auto exe = ExecutorScope::Current(); - exe->lazyEval = false; - exe->setGlobalExecutorConfig(config.type, backendConfig, config.numThread); - return true; } @@ -321,7 +313,7 @@ VARP Pipeline::vae_decoder(VARP latent) { return image; } -bool Pipeline::run(const std::string& sentence, const std::string& img_name) { +bool Pipeline::run(const std::string& prompt, const std::string& imagePath) { std::unique_ptr tok; if(mModelType == STABLE_DIFFUSION_1_5) { tok.reset(new diffusion::CLIPTokenizer); @@ -329,18 +321,18 @@ bool Pipeline::run(const std::string& sentence, const std::string& img_name) { tok.reset(new diffusion::BertTokenizer); } tok->load(mModelPath); - load_modules(mModelPath); + load_modules(); AUTOTIME; - auto ids = tok->encode(sentence, mMaxTextLen); + auto ids = tok->encode(prompt, mMaxTextLen); auto text_embeddings = text_encoder(ids); auto latent = unet(text_embeddings); auto image = vae_decoder(latent); - bool res = imwrite(img_name, image); + bool res = imwrite(imagePath, image); if (res) { - MNN_PRINT("SUCCESS! write to %s\n", img_name.c_str()); + MNN_PRINT("SUCCESS! write generated image to %s\n", imagePath.c_str()); } return true; } diff --git a/transformers/diffusion/pipeline.hpp b/transformers/diffusion/pipeline.hpp index 2d8cc2811..3ce1e95f0 100644 --- a/transformers/diffusion/pipeline.hpp +++ b/transformers/diffusion/pipeline.hpp @@ -19,11 +19,11 @@ typedef enum { class Pipeline { public: - Pipeline(std::string modelPath, DiffusionModelType modelType); + Pipeline(std::string modelPath, DiffusionModelType modelType, MNNForwardType backendType, int memoryMode); ~Pipeline() = default; - bool run(const std::string& sentence, const std::string& img_name); + bool run(const std::string& prompt, const std::string& imagePath); private: - bool load_modules(std::string modelPath); + bool load_modules(); VARP step_plms(VARP sample, VARP model_output, int index); VARP text_encoder(const std::vector& ids); VARP unet(VARP text_embeddings); @@ -31,16 +31,19 @@ class Pipeline { private: std::shared_ptr runtime_manager_; std::vector> mModules; - - std::string mModelPath; - DiffusionModelType mModelType; - int mMaxTextLen = 77; // step_plms std::vector mTimeSteps; std::vector mAlphas; std::vector mEts; VARP mSample; VARP mLatentVar, mPromptVar, mTimestepVar, mSampleVar; + +private: + std::string mModelPath; + DiffusionModelType mModelType; + int mMaxTextLen = 77; + int mMemoryMode; + MNNForwardType mBackendType; }; } diff --git a/transformers/llm/config.json b/transformers/llm/config.json index 7025fad4b..f34f70063 100755 --- a/transformers/llm/config.json +++ b/transformers/llm/config.json @@ -5,5 +5,11 @@ "backend_type": "cpu", "thread_num": 4, "precision": "low", - "memory": "low" -} + "memory": "low", + + "is_batch_quant": 1, + + "reuse_kv": false, + "quant_kv": 0, + "kvcache_limit": -1 +} \ No newline at end of file diff --git a/transformers/llm/engine/CMakeLists.txt b/transformers/llm/engine/CMakeLists.txt index 767cc272e..2601e4d92 100644 --- a/transformers/llm/engine/CMakeLists.txt +++ b/transformers/llm/engine/CMakeLists.txt @@ -4,24 +4,22 @@ include_directories(${CMAKE_CURRENT_LIST_DIR}/include/) # source files FILE(GLOB SRCS ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp) -if (MSVC) - # compile static lib, surrpot Winwows - add_library(llm STATIC ${SRCS}) - target_link_libraries(llm ${MNN_DEPS}) -else() - if (MNN_SEP_BUILD) - if (MNN_BUILD_SHARED_LIBS) - # compile dynamic so, support Linux/Mac - add_library(llm SHARED ${SRCS}) - set_target_properties(llm PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE) - target_link_libraries(llm ${MNN_DEPS}) - else() - add_library(llm STATIC ${SRCS}) - endif() +if (MNN_SEP_BUILD) + if (MNN_BUILD_SHARED_LIBS) + # compile dynamic so, support Linux/Mac + add_library(llm SHARED ${SRCS}) + set_target_properties(llm PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE) + target_link_libraries(llm ${MNN_DEPS}) else() - add_library(llm OBJECT ${SRCS}) + add_library(llm STATIC ${SRCS}) endif() +else() + add_library(llm OBJECT ${SRCS}) endif() add_executable(llm_demo ${CMAKE_CURRENT_LIST_DIR}/llm_demo.cpp) -target_link_libraries(llm_demo llm) \ No newline at end of file +IF (NOT MNN_SEP_BUILD) + target_link_libraries(llm_demo ${MNN_DEPS}) +ELSE () + target_link_libraries(llm_demo ${MNN_DEPS} llm) +ENDIF () \ No newline at end of file diff --git a/transformers/llm/engine/include/llm/llm.hpp b/transformers/llm/engine/include/llm/llm.hpp index 4e1f445df..a4b868592 100644 --- a/transformers/llm/engine/include/llm/llm.hpp +++ b/transformers/llm/engine/include/llm/llm.hpp @@ -70,6 +70,11 @@ class MNN_PUBLIC Llm { // config function std::string dump_config(); bool set_config(const std::string& content); + // lora function + size_t apply_lora(const std::string& lora_path); + Llm* create_lora(const std::string& lora_path); + bool release_module(size_t index); + bool select_module(size_t index); friend class Pipeline; public: // forward info @@ -89,8 +94,8 @@ class MNN_PUBLIC Llm { MNN::Express::VARP inputs_embeds_, attention_mask_, position_ids_; std::shared_ptr runtime_manager_; std::vector> modules_; - std::vector> decode_modules_; - std::vector> prefill_modules_; + std::vector> prefill_modules_, decode_modules_, current_modules_; + const MNN::Express::Module* base_module_ = nullptr; void init_runtime(); std::string decode(int id); bool is_stop(int token_id); @@ -98,6 +103,8 @@ class MNN_PUBLIC Llm { virtual MNN::Express::VARP embedding(const std::vector& input_ids); virtual MNN::Express::VARP gen_attention_mask(int seq_len); virtual MNN::Express::VARP gen_position_ids(int seq_len); + bool mTracing = false; + }; // Embedding start diff --git a/transformers/llm/engine/llm_demo.cpp b/transformers/llm/engine/llm_demo.cpp index 416154f84..3e41b2eb0 100644 --- a/transformers/llm/engine/llm_demo.cpp +++ b/transformers/llm/engine/llm_demo.cpp @@ -14,23 +14,9 @@ using namespace MNN::Transformer; static void trace_prepare(Llm* llm) { MNN_PRINT("Prepare for resize opt Begin\n"); - std::vector prompts = { - "Hello", - }; llm->trace(true); - int prompt_len = 0; - int decode_len = 0; - int64_t prefill_time = 0; - int64_t decode_time = 0; - // llm->warmup(); - for (int i = 0; i < prompts.size(); i++) { - std::ostringstream cacheOs; - llm->response(prompts[i], &cacheOs); - prompt_len += llm->prompt_len_; - decode_len += llm->gen_seq_len_; - prefill_time += llm->prefill_us_; - decode_time += llm->decode_us_; - } + std::ostringstream cacheOs; + llm->response("Hello", &cacheOs); MNN_PRINT("Prepare for resize opt End\n"); llm->trace(false); } @@ -181,7 +167,7 @@ int main(int argc, const char* argv[]) { AUTOTIME; llm->load(); } - if (true) { + if (false) { AUTOTIME; trace_prepare(llm.get()); } diff --git a/transformers/llm/engine/src/llm.cpp b/transformers/llm/engine/src/llm.cpp index 4ed60d9c2..efa9a5d23 100644 --- a/transformers/llm/engine/src/llm.cpp +++ b/transformers/llm/engine/src/llm.cpp @@ -97,8 +97,10 @@ void Llm::init_runtime() { runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config)); runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0); runtime_manager_->setHint(MNN::Interpreter::DYNAMIC_QUANT_OPTIONS, 1); // 1: per batch quant, 2: per tensor quant - runtime_manager_->setHint(MNN::Interpreter::KVCACHE_QUANT_OPTIONS, config_->quant_kv()); // 0: no quant, 1: quant key, 2: quant value, 3: quant kv - + runtime_manager_->setHint(MNN::Interpreter::KVCACHE_QUANT_OPTIONS, config_->quant_kv()); + runtime_manager_->setHint(MNN::Interpreter::KVCACHE_SIZE_LIMIT, config_->kvcache_limit()); + runtime_manager_->setExternalPath("/tmp/.kvcache", MNN::Interpreter::EXTERNAL_PATH_KVCACHE_DIR); + #if DEBUG_MODE==1 runtime_manager_->setMode(MNN::Interpreter::Session_Debug); _initTimeTrace(); @@ -130,6 +132,10 @@ void Llm::load() { Module::Config module_config; module_config.shapeMutable = true; module_config.rearrange = true; + // using base module for lora module + if (base_module_ != nullptr) { + module_config.base = base_module_; + } int layer_nums = config_->layer_nums(); if (is_single_) { // load single model @@ -164,6 +170,54 @@ void Llm::load() { prefill_modules_ = modules_; } +size_t Llm::apply_lora(const std::string& lora_path) { + std::string model_path = config_->base_dir_ + "/" + lora_path; + Module::Config module_config; + module_config.shapeMutable = true; + module_config.rearrange = true; + module_config.base = modules_.begin()->get(); + size_t lora_index = modules_.size(); + modules_.emplace_back(Module::load({"input_ids", "attention_mask", "position_ids", "past_key_values"}, + {"logits", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); + select_module(lora_index); + return lora_index; +} + +Llm* Llm::create_lora(const std::string& lora_path) { + auto llm = new Llm(config_); + llm->set_config("{\"llm_model\": \"" + lora_path + "\"}"); + llm->base_module_ = modules_.begin()->get(); + llm->load(); + return llm; +} + +bool Llm::release_module(size_t index) { + if (index >= modules_.size()) { + return false; + } + if (prefill_modules_[0] == modules_[index]) { + select_module(0); + } + modules_[index].reset(); + return true; +} + +bool Llm::select_module(size_t index) { + if (index >= modules_.size()) { + return false; + } + if (modules_[index] == nullptr) { + return false; + } + if (decode_modules_.empty()) { + decode_modules_.resize(modules_.size()); + prefill_modules_.resize(modules_.size()); + } + decode_modules_[0].reset(Module::clone(modules_[index].get())); + prefill_modules_[0] = modules_[index]; + return true; +} + void Llm::trace(bool start) { auto status = MNN::Interpreter::Session_Resize_Check; if (start) { @@ -175,6 +229,7 @@ void Llm::trace(bool start) { m->traceOrOptimize(status); } runtime_manager_->updateCache(); + mTracing = start; } VARP Llm::forward(const std::vector& input_ids) { @@ -185,11 +240,10 @@ VARP Llm::forward(const std::vector& input_ids) { if (is_single_) { // single model auto hidden_states = embedding(input_ids); - auto outputs = modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]}); + auto outputs = current_modules_.back()->onForward({hidden_states, attention_mask, position_ids, past_key_values_[0]}); if (outputs.empty()) { return nullptr; } - ExecutorScope::Current()->gc(Executor::FULL); logits = outputs[0]; past_key_values_[0] = outputs[1]; } else { @@ -199,14 +253,13 @@ VARP Llm::forward(const std::vector& input_ids) { ExecutorScope::Current()->gc(Executor::FULL); for (int i = 0; i < layer_nums; i++) { AUTOTIME; - auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]}); + auto outputs = current_modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]}); hidden_states = outputs[0]; past_key_values_[i] = outputs[1]; } - ExecutorScope::Current()->gc(Executor::FULL); { AUTOTIME; - auto outputs = modules_[layer_nums]->onForward({hidden_states}); + auto outputs = current_modules_[layer_nums]->onForward({hidden_states}); logits = outputs[0]; } } @@ -326,6 +379,7 @@ std::vector Llm::generate(const std::vector& input_ids, int max_new_to prompt_len_ = static_cast(input_ids.size()); if (max_new_tokens < 0) { max_new_tokens = config_->max_new_tokens(); } // prefill + current_modules_ = prefill_modules_; auto logits = forward(input_ids); if (logits.get() == nullptr) { return {}; @@ -334,6 +388,7 @@ std::vector Llm::generate(const std::vector& input_ids, int max_new_to output_ids.push_back(token); all_ids.push_back(token); // decode + current_modules_ = decode_modules_; while (gen_seq_len_ < max_new_tokens) { logits = forward({token}); if (logits.get() == nullptr) { @@ -348,17 +403,26 @@ std::vector Llm::generate(const std::vector& input_ids, int max_new_to } std::string Llm::generate(const std::vector& input_ids, std::ostream* os, const char* end_with) { + if (mTracing) { + // Skip real forward + current_modules_ = prefill_modules_; + forward(input_ids); + current_modules_ = decode_modules_; + forward({input_ids[0]}); + forward({input_ids[0]}); + return "Test"; + } prompt_len_ = static_cast(input_ids.size()); history_ids_.insert(history_ids_.end(), input_ids.begin(), input_ids.end()); // push to history_ids_ auto st = std::chrono::system_clock::now(); - modules_ = prefill_modules_; + current_modules_ = prefill_modules_; auto logits = forward(input_ids); if (nullptr == logits.get()) { return ""; } int token = sample(logits, history_ids_); auto et = std::chrono::system_clock::now(); - modules_ = decode_modules_; + current_modules_ = decode_modules_; std::string output_str = decode(token); prefill_us_ = std::chrono::duration_cast(et - st).count(); *os << output_str << std::flush; @@ -383,6 +447,7 @@ std::string Llm::generate(const std::vector& input_ids, std::ostream* os, c *os << word << std::flush; output_str += word; } + ExecutorScope::Current()->gc(Executor::FULL); #ifdef DUMP_PROFILE_INFO print_speed(); #endif @@ -414,9 +479,9 @@ std::string Llm::response(const std::vector& chat_prompts, std::ostr if (config_->reuse_kv() && all_seq_len_ > 0) { prompt = "<|im_end|>\n" + prompt; } - std::cout << "# prompt : " << prompt << std::endl; + // std::cout << "# prompt : " << prompt << std::endl; auto input_ids = tokenizer_->encode(prompt); - printf("input_ids (%lu): ", input_ids.size()); for (auto id : input_ids) printf("%d, ", id); printf("\n"); + // printf("input_ids (%lu): ", input_ids.size()); for (auto id : input_ids) printf("%d, ", id); printf("\n"); return generate(input_ids, os, end_with); } @@ -443,6 +508,7 @@ Llm::~Llm() { MNN_PRINT("OP Summer: %.7f, Flops: %.7f, Speed: %.7f GFlops\n", opSummer, opFlopsSummber, opFlopsSummber/opSummer); } #endif + current_modules_.clear(); decode_modules_.clear(); prefill_modules_.clear(); modules_.clear(); diff --git a/transformers/llm/engine/src/llmconfig.hpp b/transformers/llm/engine/src/llmconfig.hpp index 71ef7291f..b09ab6177 100644 --- a/transformers/llm/engine/src/llmconfig.hpp +++ b/transformers/llm/engine/src/llmconfig.hpp @@ -227,10 +227,6 @@ class LlmConfig { bool reuse_kv() const { return config_.value("reuse_kv", false); } - - int quant_kv() const { - return config_.value("quant_kv", 0); - } // generate config end > // < backend config start @@ -249,6 +245,14 @@ class LlmConfig { std::string memory() const { return config_.value("memory", "low"); } + + int quant_kv() const { + return config_.value("quant_kv", 0); + } + + int kvcache_limit() const { + return config_.value("kvcache_limit", -1); + } // backend config end > // < llm model config start diff --git a/transformers/llm/engine/src/tokenizer.cpp b/transformers/llm/engine/src/tokenizer.cpp index 6330d8885..87f02c868 100644 --- a/transformers/llm/engine/src/tokenizer.cpp +++ b/transformers/llm/engine/src/tokenizer.cpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace MNN { namespace Transformer { @@ -75,7 +76,7 @@ static std::string base64_decode(const std::string& str) { static inline void to_lower_case(std::string& str) { for (auto &c : str) { if (c >= 'A' && c <= 'Z') { - c = std::tolower(static_cast(c)); + c = tolower(static_cast(c)); } } } @@ -540,19 +541,19 @@ void BertTokenizer::encode(const std::string& str, std::vector& ids) { } } // handle continuous sequence of letters and digits - else if (std::isalnum(c)) { - while (i < str.size() && std::isalnum(static_cast(str[i]))) { - current_token += std::tolower(str[i]); + else if (isalnum(c)) { + while (i < str.size() && isalnum(static_cast(str[i]))) { + current_token += tolower(str[i]); ++i; } } // handle punctuation and symbols - else if (std::ispunct(c)) { + else if (ispunct(c)) { current_token = str[i]; ++i; } // handle space, tab, enter - else if (std::isspace(c)) { + else if (isspace(c)) { ++i; continue; }