diff --git a/docs/2024.html b/docs/2024.html
index 347e858a7f..6ecfb6f290 100644
--- a/docs/2024.html
+++ b/docs/2024.html
@@ -35,7 +35,7 @@
Simd Library Release Notes (2024).
Home
-May X, 2024 (version X.X.137)
+May 2, 2024 (version 6.1.137)
Algorithms
New features
diff --git a/docs/download.html b/docs/download.html
index c4a3beb936..94464509cd 100644
--- a/docs/download.html
+++ b/docs/download.html
@@ -27,6 +27,9 @@ Simd Library Download.
2024
Release Notes | Download Link | Size |
+ May 2, 2024 |
+ Simd-6.1.137.zip |
+ 5.8 MB |
April 2, 2024 |
Simd-6.1.136.zip |
5.7 MB |
diff --git a/prj/txt/UserVersion.txt b/prj/txt/UserVersion.txt
index 3f29014838..69bdf64511 100644
--- a/prj/txt/UserVersion.txt
+++ b/prj/txt/UserVersion.txt
@@ -1 +1 @@
-6.1.136
\ No newline at end of file
+6.1.137
\ No newline at end of file
diff --git a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcGemm.cpp b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcGemm.cpp
index ba1edab79d..d2eeafd495 100644
--- a/src/Simd/SimdAmxBf16SynetConvolution16bNhwcGemm.cpp
+++ b/src/Simd/SimdAmxBf16SynetConvolution16bNhwcGemm.cpp
@@ -37,12 +37,12 @@ namespace Simd
typedef Base::SynetConvolution16bNhwcGemm::AlgParam AlgParam;
typedef Base::SynetConvolution16bNhwcGemm::ConvolutionPtr Convolution;
- //-----------------------------------------------------------------------------------------
+#define SIMD_CONV_REORDER_TYPE 1
- #define SIMD_CONV_REORDER_TYPE 1
+ //-----------------------------------------------------------------------------------------
- static void Convert16bNhwcGemm(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
- {
+ static void Convert16bNhwcGemmD(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
+ {
const float* src = (float*)src8;
size_t srcC32 = AlignLo(p.srcC, 32);
__mmask16 srcMask[2];
@@ -59,7 +59,6 @@ namespace Simd
for (size_t dx = 0; dx < p.dstW; ++dx, ++dr)
{
uint16_t* row = dst + dr * a.bufK;
-
for (size_t ky = 0, k = 0; ky < p.kernelY; ky++)
{
size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
@@ -96,7 +95,71 @@ namespace Simd
}
}
- static void Convert16bNhwcGemm1x1(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
+ static void Convert16bNhwcGemmR(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst) // Writes one a.bufK-strided row into dst per output pixel of rows [yBeg, yEnd), gathering the kernel window from float input indexed as (y * srcW + x) * srcC.
+ {
+ const float* src = (float*)src8; // the byte pointer is reinterpreted as 32-bit float data
+ size_t srcC32 = AlignLo(p.srcC, 32);
+ assert(p.srcC == srcC32); // this variant has no masked tail path: the channel count must be a multiple of 32
+ for (size_t dy = yBeg, dr = (a.macroK < a.bufK ? dy * p.dstW : 0) + b * p.dstH * p.dstW; dy < yEnd; ++dy) // dr = index of the destination row: batch offset b * dstH * dstW, plus dy * dstW only when a.macroK < a.bufK
+ {
+ for (size_t dx = 0; dx < p.dstW; ++dx, ++dr)
+ {
+ uint16_t* row = dst + dr * a.bufK; // start of the destination row for output pixel (dy, dx)
+ for (size_t ky = 0, k = 0; ky < p.kernelY; ky++) // NOTE(review): k is declared here but never used in this function
+ {
+ size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; // stored as unsigned: a position above the image wraps to a very large value
+ if (sy < p.srcH) // so this single compare rejects rows on either side of the image
+ {
+ for (size_t kx = 0; kx < p.kernelX; kx++)
+ {
+ size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; // same unsigned wrap-around trick for columns
+ if (sx < p.srcW)
+ {
+ const float* ps = src + (sy * p.srcW + sx) * p.srcC; // no batch term here — presumably src already points at item b; confirm at caller
+ for (size_t sc = 0; sc < srcC32; sc += 32)
+ ConvertA(ps + sc, row + sc); // ConvertA is defined elsewhere; presumably converts 32 floats to 32 16-bit values — confirm
+ row += p.srcC; // advance past the channels of this kernel tap
+ }
+ else
+ {
+ for (size_t sc = 0; sc < srcC32; sc += 32)
+ SetZero(row + sc); // column outside the image: this tap is zero-filled (SetZero defined elsewhere; presumably clears 32 values — confirm)
+ row += p.srcC;
+ }
+ }
+ }
+ else
+ {
+ for (size_t sc = 0, n = p.kernelX * p.srcC; sc < n; sc += 32) // whole kernel row is outside the image: zero all kernelX taps in one pass
+ SetZero(row + sc);
+ row += p.kernelX * p.srcC;
+ }
+ }
+ }
+ }
+ }
+
+ static void Convert16bNhwcGemm1x1D(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst) // Converts n consecutive pixels of p.srcC floats each into dst rows spaced a.bufK apart; this is the variant selected when SIMD_CONV_REORDER_TYPE is 0.
+ {
+ const float* src = (float*)src8; // the byte pointer is reinterpreted as 32-bit float data
+ size_t srcC32 = AlignLo(p.srcC, 32), n = (yEnd - yBeg) * p.dstW; // n = number of pixels to convert
+ __mmask16 srcMask0 = TailMask16(p.srcC - srcC32 - F * 0); // mask for the first group of leftover channels past srcC32 (F is defined elsewhere; presumably 16 lanes — confirm)
+ __mmask16 srcMask1 = TailMask16(p.srcC - srcC32 - F * 1); // mask for the second group of leftover channels
+ src += yBeg * p.srcW * p.srcC; // NOTE(review): yBeg is scaled by p.srcW while n uses p.dstW — presumably equal on this 1x1 path; confirm
+ dst += ((a.macroK < a.bufK ? yBeg * p.dstW : 0) + b * p.dstH * p.dstW) * a.bufK; // batch offset, plus the yBeg row offset only when a.macroK < a.bufK
+ for (size_t i = 0; i < n; ++i)
+ {
+ size_t sc = 0;
+ for (; sc < srcC32; sc += 32)
+ ConvertA(src + sc, dst + sc); // full groups of 32 channels (ConvertA is defined elsewhere)
+ if (srcC32 < p.srcC)
+ ConvertA(src + sc, dst + sc, srcMask0, srcMask1); // leftover channels (fewer than 32) go through the masked overload
+ src += p.srcC; // next source pixel
+ dst += a.bufK; // next destination row
+ }
+ }
+
+ static void Convert16bNhwcGemm1x1R(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
{
const float* src = (float*)src8;
size_t srcC32 = AlignLo(p.srcC, 32), n = (yEnd - yBeg) * p.dstW;
@@ -104,7 +167,6 @@ namespace Simd
__mmask16 srcMask1 = TailMask16(p.srcC - srcC32 - F * 1);
src += yBeg * p.srcW * p.srcC;
dst += ((a.macroK < a.bufK ? yBeg * p.dstW : 0) + b * p.dstH * p.dstW) * a.bufK;
-#if SIMD_CONV_REORDER_TYPE
for (size_t i = 0; i < n; i += 16)
{
size_t m = Min(i + 16, n) - i;
@@ -114,32 +176,16 @@ namespace Simd
size_t j = 0;
for(; j < m; ++j)
ConvertA(src + sc + j * p.srcC, dst + j * 32 + sc * 16);
- for (; j < 16; ++j)
- SetZero(dst + j * 32 + sc * 16);
}
if (srcC32 < p.srcC)
{
size_t j = 0;
for (; j < m; ++j)
ConvertA(src + sc + j * p.srcC, dst + j * 32 + sc * 16, srcMask0, srcMask1);
- for (; j < 16; ++j)
- SetZero(dst + j * 32 + sc * 16);
}
src += p.srcC * 16;
dst += a.bufK * 16;
}
-#else
- for (size_t i = 0; i < n; ++i)
- {
- size_t sc = 0;
- for (; sc < srcC32; sc += 32)
- ConvertA(src + sc, dst + sc);
- if (srcC32 < p.srcC)
- ConvertA(src + sc, dst + sc, srcMask0, srcMask1);
- src += p.srcC;
- dst += a.bufK;
- }
-#endif
}
static void Reorder16bNhwcGemm(const uint8_t* src8, const ConvParam& p, const AlgParam& a, size_t b, size_t yBeg, size_t yEnd, uint16_t* dst)
@@ -479,12 +525,26 @@ namespace Simd
{
if (_is1x1)
{
- _convert = Convert16bNhwcGemm1x1;
- a.reorderType = SIMD_CONV_REORDER_TYPE;
+#if SIMD_CONV_REORDER_TYPE
+ _convert = Convert16bNhwcGemm1x1R;
+ a.reorderType = 1;
+#else
+ _convert = Convert16bNhwcGemm1x1D;
+ a.reorderType = 0;
+#endif
}
else
{
- _convert = Convert16bNhwcGemm;
+ if (p.srcC == AlignLo(p.srcC, 32))
+ {
+ _convert = Convert16bNhwcGemmR;
+ a.reorderType = 0;
+ }
+ else
+ {
+ _convert = Convert16bNhwcGemmD;
+ a.reorderType = 0;
+ }
}
}
switch (p.activation)
diff --git a/src/Simd/SimdSynetConvolution16bCommon.h b/src/Simd/SimdSynetConvolution16bCommon.h
index 6fb86ac6c6..899eaa4c5f 100644
--- a/src/Simd/SimdSynetConvolution16bCommon.h
+++ b/src/Simd/SimdSynetConvolution16bCommon.h
@@ -433,6 +433,20 @@ namespace Simd
_mm512_mask_storeu_epi16(dst, dstMask, _mm512_setzero_si512());
}
+ SIMD_INLINE void Copy(const uint16_t* src, uint16_t* dst, __mmask32 srcMask = __mmask32(-1), __mmask32 dstMask = __mmask32(-1)) // Copies up to 32 16-bit values from src to dst with unaligned access; both masks default to all 32 lanes.
+ {
+ _mm512_mask_storeu_epi16(dst, dstMask, _mm512_maskz_loadu_epi16(srcMask, src)); // lanes outside srcMask are loaded as zero; only lanes set in dstMask are written
+ }
+
+ SIMD_INLINE void Copy(const uint16_t* src, uint16_t* dst, size_t size32, __mmask32 tail = __mmask32(0)) // Copies 16-bit values in steps of 32 while i < size32, then an optional masked tail placed right after them.
+ {
+ size_t i = 0;
+ for(; i < size32; i += 32) // NOTE(review): each step moves a full 32 lanes, so size32 is presumably a multiple of 32 — confirm at callers
+ _mm512_storeu_epi16(dst + i, _mm512_loadu_epi16(src + i));
+ if(tail) // tail == 0 (the default) skips the partial copy
+ _mm512_mask_storeu_epi16(dst + i, tail, _mm512_maskz_loadu_epi16(tail, src + i)); // the same mask limits both the load and the store
+ }
+
//-------------------------------------------------------------------------------------------------
template struct Term16b