diff --git a/source/backend/cpu/arm/kleidiAI/mnn_kleidiai.h b/source/backend/cpu/arm/kleidiAI/mnn_kleidiai.h
index 0c8b73059..38cdce230 100644
--- a/source/backend/cpu/arm/kleidiAI/mnn_kleidiai.h
+++ b/source/backend/cpu/arm/kleidiAI/mnn_kleidiai.h
@@ -30,9 +30,9 @@ namespace MNN {
     class KleidiAI {
     public:
-        static KleidiAI &getInstance(bool bAsymmetric) {
+        static KleidiAI &getInstance(bool bAsymmetric, bool acthalf, bool blockwise) {
             if(!instance) {
-                instance = new KleidiAI(bAsymmetric);
+                instance = new KleidiAI(bAsymmetric, acthalf, blockwise);
             }
             return *instance;
         }
 
@@ -49,6 +49,8 @@ namespace MNN {
         typedef struct KaiInfo {
             bool kaiEnable = false;
             bool asymmetric = false; //Asymmetric quantized model.
+            bool acthalf = false; // activation half precision.
+            bool blockwise = false; // weight quant using block wise.
             bool dot = false; //CPU support sdot.
             bool i8mm = false; //CPU support i8mm.
         } KaiInfo;
@@ -62,7 +64,10 @@ namespace MNN {
         void setModelAsymmetric(bool bAsymmetric);
 
         //Check
-        bool canAccelerate() { return (mKaiInfo.kaiEnable && mKaiInfo.dot && mKaiInfo.i8mm && !mKaiInfo.asymmetric); }
+        bool canAccelerate() {
+            return (mKaiInfo.kaiEnable && mKaiInfo.dot && mKaiInfo.i8mm &&
+                    !mKaiInfo.asymmetric && !mKaiInfo.acthalf && !mKaiInfo.blockwise);
+        }
 
         //Get info
         size_t getMr(size_t m = 1) { return (m == 1) ? mKaiMrDotprod : mKaiMrI8mm; }
@@ -90,12 +95,15 @@ namespace MNN {
         void runMatmul(size_t m, size_t n, size_t k, const void* lhsPacked, const void* rhsPacked, size_t dst_stride, void* dst);
 
     private:
-        KleidiAI(bool bAsymmetric = false) {
+        KleidiAI(bool bAsymmetric = false, bool acthalf = false, bool blockwise = false) {
             const MNNCPUInfo& gCPUInfo = *MNNGetCPUInfo();
             mKaiInfo.dot = gCPUInfo.dot;
             mKaiInfo.i8mm = gCPUInfo.i8mm;
             mKaiInfo.kaiEnable = true;
             mKaiInfo.asymmetric = bAsymmetric;
+            mKaiInfo.acthalf = acthalf;
+            mKaiInfo.blockwise = blockwise;
+
             if(canAccelerate()) {
                 MNN_PRINT("\nKleidiAI is running!\n");
             }
diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp
index bdaa08045..4788d88c3 100644
--- a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp
+++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp
@@ -83,7 +83,7 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS
         for (int y = 0; y < ic; ++y) {
             const int yOutSide = y / SRC_UNIT;
             const int yInSide = y % SRC_UNIT;
-            
+
             int blockId = (yOutSide + k * icDivU) / blockL;
             int blockInsideId = (yOutSide + k * icDivU) % blockL;
 
@@ -268,9 +268,13 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
     int oc = convOp->common()->outputCount();
     int ic = convOp->common()->inputCount();
     bool directReadInt4weight = (kernelCount == 1 && ROUND_UP(oc, UNIT) == oc && ROUND_UP(ic, SRC_UNIT) == ic);
-    
+
 #ifdef MNN_KLEIDIAI_ENABLED
-    KleidiAI kai = KleidiAI::getInstance(quanCommon->asymmetric);
+    bool half_act = gcore->bytes == 2;
+    int biasSize = mResourceInt8->mOriginBias->size();
+    int alphaSize = mResourceInt8->mOriginScale->size();
+    bool blockwise = (biasSize * 2) != alphaSize;
+    KleidiAI kai = KleidiAI::getInstance(quanCommon->asymmetric, half_act, blockwise);
     if(quanCommon->canUseInt4 && kai.canAccelerate()) {
         int n = oc;
         int k = ic;
@@ -294,9 +298,9 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
 
         return;
     }
-    
+
 #endif
-    
+
     if (quanCommon->canUseInt4 && directReadInt4weight) {
         // int4 weight reorder
         mResourceInt8->mWeightAsymmetricQuant = true;
@@ -305,7 +309,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
         int lU = UP_DIV(ic, SRC_UNIT);
         int hP = UNIT;
         int lP = SRC_UNIT;
-        
+
         // weight shape.
         std::vector shape;
         if (SRC_UNIT > pack) {
@@ -337,7 +341,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
                 int blockkInsideId = j % blockL;
                 for (int k = 0; k < cnt; ++k) {
                     int dstIndx0 = (blockId * stride0 + i * stride1 + blockkInsideId * lP * hP) / 2 + (2 * k);
-                    
+
                     int hpId0 = (2 * k + 1) / lP;
                     int lpId0 = (2 * k) % lP;
                     int hpId1 = (2 * (k + cnt) + 1) / lP;
@@ -350,7 +354,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
                     int s3 = (srcPtr[srcIndx1] & 15);
                     int d0 = s0 * 16 + s2;
                     int d1 = s1 * 16 + s3;
-                    
+
                     dstPtr[dstIndx0] = d0;
                     dstPtr[dstIndx0 + 1] = d1;
                 }
@@ -358,7 +362,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
         }
     } else {
         // std::shared_ptr srcWeight;
-        
+
         if (quanCommon->canUseInt4) {
             mResourceInt8->mWeightAsymmetricQuant = true;
             auto srcPtr = reinterpret_cast(quanCommon->weight.get());
@@ -392,7 +396,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
                     dst0[j] = d;
                 }
             }
-            
+
             // Update int4 weight to mWeightInt8.
             mResourceInt8->mWeightInt8 = weightLow;
         } else {
@@ -434,7 +438,7 @@ static void _computeAlphaScale(Backend* backend, const Convolution2D* conv2d, st
     auto alphaPtr = scaleBias->host();
     auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + ocUp4 * core->bytes);
     ::memset(alphaPtr, 0, 2 * ocUp4 * core->bytes);
-    
+
     // Load quant scale and bias
     weightOrigin = resourceInt8->mWeightInt8->host();
    auto wZero = resourceInt8->mWeightQuantZero->host(); // has packed to outputUp4
@@ -454,7 +458,7 @@ static void _computeAlphaScale(Backend* backend, const Convolution2D* conv2d, st
         }
     }
     resourceInt8->mOriginScale = scaleBias;
-    
+
     // Compute float weightKernelSum
     resourceInt8->mWeightKernelSum.reset(Tensor::createDevice({ocUp4 * 4}));
     success = backend->onAcquireBuffer(resourceInt8->mWeightKernelSum.get(), Backend::STATIC);
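Note on the gating introduced above: the KleidiAI int4 path is taken only for symmetric, per-channel quantized weights with fp32 activations. The standalone sketch below restates that heuristic; the helper name canUseKleidiAI and its parameters are illustrative only, and it assumes mKaiInfo.kaiEnable is true, that the size() values are byte counts, and that mOriginScale packs scale plus bias per output channel (so per-channel quantization gives alphaSize == 2 * biasSize, as set up in _computeAlphaScale).

// Illustrative sketch, not part of the patch: mirrors canAccelerate() and the
// call site in DenseConvInt8TiledExecutor above.
#include <cstddef>

static bool canUseKleidiAI(bool asymmetric, std::size_t activationBytes,
                           std::size_t biasSize, std::size_t alphaSize,
                           bool cpuHasDot, bool cpuHasI8mm) {
    const bool actHalf   = (activationBytes == 2);        // fp16 activations are not accelerated
    const bool blockwise = (biasSize * 2) != alphaSize;    // extra scale entries imply block-wise weight quant
    return cpuHasDot && cpuHasI8mm && !asymmetric && !actHalf && !blockwise;
}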