From f947de6635d6bd9f4933fb4c21e8326d85d9638b Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Fri, 12 Apr 2024 12:16:51 +0100 Subject: [PATCH] Add alpha support for SVE2.1 (#257) This patch adds new intrinsics and types for supporting SVE2.1. --- main/acle.md | 1621 +++++++++++++++++++++++++++++++------------------- 1 file changed, 999 insertions(+), 622 deletions(-) diff --git a/main/acle.md b/main/acle.md index 190b5cc5..380e5a32 100644 --- a/main/acle.md +++ b/main/acle.md @@ -1855,6 +1855,10 @@ SVE language extensions: the Armv9-A SVE2 extension (FEAT_SVE2) and if the associated ACLE intrinsics are available. This implies that `__ARM_FEATURE_SVE` is nonzero. +`__ARM_FEATURE_SVE2p1` is defined to 1 if the FEAT_SVE2p1 instructions + are available and if the associated [ACLE features] +(#sme-language-extensions-and-intrinsics) are supported. + #### NEON-SVE Bridge macros `__ARM_NEON_SVE_BRIDGE` is defined to 1 if [NEON-SVE Bridge](#neon-sve-bridge) @@ -2373,6 +2377,7 @@ be found in [[BA]](#BA). | [`__ARM_FEATURE_SVE2_SHA3`](#sha3-extension) | SVE2 support for the SHA3 cryptographic extension (FEAT_SVE_SHA3) | 1 | | [`__ARM_FEATURE_SVE2_SM3`](#sm3-extension) | SVE2 support for the SM3 cryptographic extension (FEAT_SVE_SM3) | 1 | | [`__ARM_FEATURE_SVE2_SM4`](#sm4-extension) | SVE2 support for the SM4 cryptographic extension (FEAT_SVE_SM4) | 1 | +| [`__ARM_FEATURE_SVE2p1`](#sve2) | SVE version 2.1 (FEAT_SVE2p1) | [`__ARM_FEATURE_SYSREG128`](#bit-system-registers) | Support for 128-bit system registers (FEAT_SYSREG128) | 1 | | [`__ARM_FEATURE_UNALIGNED`](#unaligned-access-supported-in-hardware) | Hardware support for unaligned access | 1 | | [`__ARM_FP`](#hardware-floating-point) | Hardware floating-point | 1 | @@ -8640,6 +8645,397 @@ This is because the ACLE intrinsic calls do not imply a particular register allocation and so the code generator must decide for itself when move instructions are required. + +### SVE2 BFloat16 data-processing instructions. + +The instructions in this section are available when __ARM_FEATURE_B16B16 is +non-zero. + +#### BFADD, BFSUB + +BFloat16 floating-point add and sub (vectors) + +``` c + svbfloat16_t svadd[_bf16]_m (svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svadd[_bf16]_x (svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svadd[_bf16]_z (svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svadd[_n_bf16]_m (svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svadd[_n_bf16]_x (svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svadd[_n_bf16]_z (svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + + svbfloat16_t svsub[_bf16]_m (svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svsub[_bf16]_x (svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svsub[_bf16]_z (svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svsub[_n_bf16]_m (svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svsub[_n_bf16]_x (svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svsub[_n_bf16]_z (svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + ``` + +#### BFCLAMP + +BFloat16 Clamp to minimum/maximum vector. + +``` c + svbfloat16_t svclamp[_bf16](svbfloat16_t op, svbfloat16_t min, svbfloat16_t max); + ``` + +#### BFMAX, BFMIN + +BFloat16 floating-point maximum/minimum (predicated). 
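
As an informal illustration, the sketch below shows one way the predicated BFloat16 maximum could be used in a strip-mined loop. The helper name, data layout and loop structure are illustrative only, and the example assumes a toolchain that implements the intrinsics whose prototypes follow.

``` c
  #include <arm_sve.h>

  // Illustrative only: elementwise bf16 maximum of a[] and b[] into r[].
  // The merging (_m) form leaves inactive lanes with the value from va.
  void bf16_elementwise_max(bfloat16_t *r, const bfloat16_t *a,
                            const bfloat16_t *b, int64_t n) {
    for (int64_t i = 0; i < n; i += svcnth()) {
      svbool_t pg = svwhilelt_b16(i, n);
      svbfloat16_t va = svld1(pg, &a[i]);
      svbfloat16_t vb = svld1(pg, &b[i]);
      svst1(pg, &r[i], svmax_m(pg, va, vb));   // BFMAX (merging)
    }
  }
 ```
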
+ + ``` c + svbfloat16_t svmax[_bf16]_m(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svmax[_bf16]_z(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svmax[_bf16]_x(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svmax[_n_bf16]_m(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svmax[_n_bf16]_z(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svmax[_n_bf16]_x(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + + svbfloat16_t svmin[_bf16]_m(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svmin[_bf16]_z(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svmin[_bf16]_x(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svmin[_n_bf16]_m(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svmin[_n_bf16]_z(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svmin[_n_bf16]_x(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + ``` + +#### BFMAXNM, BFMINNM + +BFloat16 floating-point maximum/minimum number (predicated). + + ``` c + svbfloat16_t svmaxnm[_bf16]_m(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svmaxnm[_bf16]_z(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svmaxnm[_bf16]_x(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svmaxnm[_n_bf16]_m(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svmaxnm[_n_bf16]_z(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svmaxnm[_n_bf16]_x(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + + svbfloat16_t svminnm[_bf16]_m(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svminnm[_bf16]_z(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svminnm[_bf16]_x(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svminnm[_n_bf16]_m(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svminnm[_n_bf16]_z(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svminnm[_n_bf16]_x(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + ``` + +#### BFMLA, BFMLS +BFloat16 floating-point fused multiply add or sub vectors. 
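
For illustration, hypothetical wrappers around the multiply-add forms are sketched below; the helper names and the lane index are examples only, and the full prototypes follow.

``` c
  #include <arm_sve.h>

  // Illustrative only: acc + (a * b) per lane, merging on inactive lanes.
  svbfloat16_t bf16_fma(svbool_t pg, svbfloat16_t acc,
                        svbfloat16_t a, svbfloat16_t b) {
    return svmla_m(pg, acc, a, b);          // BFMLA
  }

  // Indexed form: multiply every element of zn by element 3 of zm.
  svbfloat16_t bf16_fma_lane3(svbfloat16_t acc, svbfloat16_t zn,
                              svbfloat16_t zm) {
    return svmla_lane(acc, zn, zm, 3);      // BFMLA (indexed)
  }
 ```
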
+ + ``` c + svbfloat16_t svmla[_bf16]_m(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn, + svbfloat16_t zm); + svbfloat16_t svmla[_bf16]_z(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn, + svbfloat16_t zm); + svbfloat16_t svmla[_bf16]_x(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn, + svbfloat16_t zm); + svbfloat16_t svmla[_n_bf16]_m(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn, + bfloat16_t zm); + svbfloat16_t svmla[_n_bf16]_z(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn, + bfloat16_t zm); + svbfloat16_t svmla[_n_bf16]_x(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn, + bfloat16_t zm); + + svbfloat16_t svmla_lane[_bf16](svbfloat16_t zda, svbfloat16_t zn, + svbfloat16_t zm, uint64_t imm_idx); + + svbfloat16_t svmls[_bf16]_m(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn, + svbfloat16_t zm); + svbfloat16_t svmls[_bf16]_z(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn, + svbfloat16_t zm); + svbfloat16_t svmls[_bf16]_x(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn, + svbfloat16_t zm); + svbfloat16_t svmls[_n_bf16]_m(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn, + bfloat16_t zm); + svbfloat16_t svmls[_n_bf16]_z(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn, + bfloat16_t zm); + svbfloat16_t svmls[_n_bf16]_x(svbool_t pg, svbfloat16_t zda, svbfloat16_t zn, + bfloat16_t zm); + + svbfloat16_t svmls_lane[_bf16](svbfloat16_t zda, svbfloat16_t zn, + svbfloat16_t zm, uint64_t imm_idx); + ``` + +#### BFMUL + +BFloat16 floating-point multiply vectors. + + ``` c + svbfloat16_t svmul[_bf16]_m(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svmul[_bf16]_x(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svmul[_bf16]_z(svbool_t pg, svbfloat16_t zdn, svbfloat16_t zm); + svbfloat16_t svmul[_n_bf16]_m(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svmul[_n_bf16]_x(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + svbfloat16_t svmul[_n_bf16]_z(svbool_t pg, svbfloat16_t zdn, bfloat16_t zm); + + svbfloat16_t svmul_lane[_bf16](svbfloat16_t zn, svbfloat16_t zm, + uint64_t imm_idx); + ``` + +### SVE2.1 instruction intrinsics + +The functions in this section are defined by the header file + [``](#arm_sve.h) when `__ARM_FEATURE_SVE2p1` is defined. + +Some instructions overlap with the SME and SME2 architecture extensions and +are additionally available in Streaming SVE mode when __ARM_FEATURE_SME is +non-zero or __ARM_FEATURE_SME2 are non-zero. +For convenience, the intrinsics for these instructions are listed in the + following section. + +#### Multi-vector predicates + +When `__ARM_FEATURE_SVE2p1` is defined, [``](#arm_sve.h) defines the +tuple types `svboolx2_t` and `svboolx4_t`. + +These are opaque tuple types that can be accessed using the SVE intrinsics +`svsetN`, `svgetN` and `svcreateN`. `svundef2` and `svundef4` are also extended +to work with `svboolx2_t` and `svboolx4_t`. e.g. + +``` c + svbool_t svget2[_b](svboolx2_t tuple, uint64_t imm_index); + svboolx2_t svset2[_b](svboolx2_t tuple, uint64_t imm_index, svbool_t x); + svboolx2_t svcreate2[_b](svbool_t x, svbool_t y); + svboolx2_t svundef2_b(); +``` + +#### ADDQV, FADDQV + +Unsigned/FP add reduction of quadword vector segments. + +``` c + // Variants are also available for: + // _s8, _s16, _u16, _s32, _u32, _s64, _u64, + // _f16, _f32, _f64 + uint8x16_t svaddqv[_u8](svbool_t pg, svuint8_t zn); + ``` + +#### ANDQV, EORQV, ORQV + +Reduction of quadword vector segments. 
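
As a rough sketch, a per-segment OR reduction could be wrapped as shown below; the helper name is illustrative, and the example assumes the 128-bit Advanced SIMD result types from `<arm_neon.h>` are available alongside these intrinsics. The prototypes are listed below.

``` c
  #include <arm_neon.h>
  #include <arm_sve.h>

  // Illustrative only: bitwise-OR reduction of each 128-bit segment of zn;
  // the per-segment results are returned packed in one 128-bit vector.
  uint8x16_t or_reduce_segments(svbool_t pg, svuint8_t zn) {
    return svorqv_u8(pg, zn);   // ORQV
  }
 ```
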
+ +``` c + // Variants are also available for: + // _s8, _u16, _s16, _u32, _s32, _u64, _s64 + uint8x16_t svandqv[_u8](svbool_t pg, svuint8_t zn); + uint8x16_t sveorqv[_u8](svbool_t pg, svuint8_t zn); + uint8x16_t svorqv[_u8](svbool_t pg, svuint8_t zn); + ``` + +#### DUPQ + +Broadcast indexed element within each quadword vector segment. + +``` c + // Variants are also available for: + // _s8, _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + svuint8_t svdup_laneq[_u8](svuint8_t zn, uint64_t imm_idx); + ``` + +#### EXTQ + +Extract vector segment from each pair of quadword segments. + +``` c + // Variants are also available for: + // _s8, _s16, _u16, _s32, _u32, _s64, _u64 + // _bf16, _f16, _f32, _f64 + svuint8_t svextq[_u8](svuint8_t zdn, svuint8_t zm, uint64_t imm); + ``` +#### LD1D, LD1W + +Contiguous zero-extending load to quadword (single vector). + +``` c + // Variants are also available for: + // _u32, _s32 + svfloat32_t svld1uwq[_f32](svbool_t, const float32_t *ptr); + svfloat32_t svld1uwq_vnum[_f32](svbool_t, const float32_t *ptr, int64_t vnum); + + + // Variants are also available for: + // _u64, _s64 + svfloat64_t svld1udq[_f64](svbool_t, const float64_t *ptr); + svfloat64_t svld1udq_vnum[_f64](svbool_t, const float64_t *ptr, int64_t vnum); + ``` + +#### LD1Q + +Gather Load Quadword. + +``` c + // Variants are also available for: + // _u8, _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + svint8_t svld1q_gather[_u64base]_s8(svbool_t pg, svuint64_t zn); + svint8_t svld1q_gather[_u64base]_offset_s8(svbool_t pg, svuint64_t zn, int64_t offset); + svint8_t svld1q_gather_[u64]offset[_s8](svbool_t pg, const int8_t *base, svuint64_t offset); + + + // Variants are also available for: + // _u16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + svint16_t svld1q_gather_[u64]index[_s16](svbool_t pg, const int16_t *base, svuint64_t index); + svint8_t svld1q_gather[_u64base]_index_s8(svbool_t pg, svuint64_t zn, int64_t index); + ``` + +#### LD2Q, LD3Q, LD4Q + +Contiguous load two, three or four quadword structures. + +``` c + // Variants are also available for: + // _u8, _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + svint8x2_t svld2q[_s8](svbool_t pg, const int8_t *rn); + svint8x2_t svld2q_vnum[_s8](svbool_t pg, const int8_t *rn, uint64_t vnum); + svint8x3_t svld3q[_s8](svbool_t pg, const int8_t *rn); + svint8x3_t svld3q_vnum[_s8](svbool_t pg, const int8_t *rn, uint64_t vnum); + svint8x4_t svld4q[_s8](svbool_t pg, const int8_t *rn); + svint8x4_t svld4q_vnum[_s8](svbool_t pg, const int8_t *rn, uint64_t vnum); + ``` + +#### UMAXQV, SMAXQV, FMAXQV, UMINQV, SMINQV, FMINQV + +Max/Min reduction of quadword vector segments. + +``` c + // Variants are also available for: + // _s8, _u16, _s16, _u32, _s32, _u64, _s64 + // _f16, _f32, _f64 + uint8x16_t svmaxqv[_u8](svbool_t pg, svuint8_t zn); + uint8x16_t svminqv[_u8](svbool_t pg, svuint8_t zn); + ``` + +#### FMAXNMQV, FMINNMQV + +Max/Min recursive reduction of quadword vector segments. 
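
A minimal sketch follows (helper name illustrative, and assuming the 128-bit Advanced SIMD result types from `<arm_neon.h>` are available); the prototypes are listed below.

``` c
  #include <arm_neon.h>
  #include <arm_sve.h>

  // Illustrative only: maxNM reduction of each 128-bit segment of a
  // float16 vector (NaN inputs are treated as missing, as for FMAXNM).
  float16x8_t maxnm_reduce_segments(svbool_t pg, svfloat16_t zn) {
    return svmaxnmqv_f16(pg, zn);   // FMAXNMQV
  }
 ```
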
+ +``` c + // Variants are also available for _f32, _f64 + float16x8_t svmaxnmqv[_f16](svbool_t pg, svfloat16_t zn); + float16x8_t svminnmqv[_f16](svbool_t pg, svfloat16_t zn); + ``` + +#### PMOV + +``` c + // Variants are available for: + // _s8, _u16, _s16, _s32, _u32, _s64, _u64 + svbool_t svpmov_lane[_u8](svuint8_t zn, uint64_t imm); + + // Variants are available for: + // _s8, _s16, _u16, _s32, _u32, _s64, _u64 + svbool_t svpmov[_u8](svuint8_t zn); + + // Variants are available for: + // _s16, _s32, _u32, _s64, _u64 + svuint16_t svpmov_lane[_u16]_m(svuint16_t zd, svbool_t pn, uint64_t imm); + + // Variants are also available for: + // _s8, _u16, _s16, _u32, _s32, _u64, _s64 + svuint8_t svpmov_u8_z(svbool_t pn); + ``` + +#### ST1D, ST1W + +Contiguous store of single vector operand, truncating from quadword. + +``` c + // Variants are also available for: + // _u32, _s32 + void svst1wq[_f32](svbool_t, const float32_t *ptr, svfloat32_t data); + void svst1wq_vnum[_f32](svbool_t, const float32_t *ptr, int64_t vnum, svfloat32_t data); + + + // Variants are also available for: + // _u64, _s64 + void svst1dq[_f64](svbool_t, const float64_t *ptr, svfloat64_t data); + void svst1dq_vnum[_f64](svbool_t, const float64_t *ptr, int64_t vnum, svfloat64_t data); + ``` + +#### ST1Q + +Scatter store quadwords. + +``` c + // Variants are also available for: + // _u8, _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + void svst1q_scatter[_u64base][_s8](svbool_t pg, svuint64_t zn, svint8_t data); + void svst1q_scatter[_u64base]_offset[_s8](svbool_t pg, svuint64_t zn, int64_t offset, svint8_t data); + void svst1q_scatter_[u64]offset[_s8](svbool_t pg, const uint8_t *base, svuint64_t offset, svint8_t data); + + // Variants are also available for: + // _u16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + void svst1q_scatter[_u64base]_index[_s8](svbool_t pg, svuint64_t zn, int64_t index, svint8_t data); + void svst1q_scatter_[u64]index_[s16](svbool_t pg, const int16_t *base, svuint64_t index, svint16_t data); + ``` + +#### ST2Q, ST3Q, ST4Q + +Contiguous store. + +``` c + // Variants are also available for: + // _s8 _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + void svst2q[_u8](svbool_t pg, uint8_t *rn, svuint8x2_t zt); + void svst2q_vnum[_u8](svbool_t pg, uint8_t *rn, int64_t vnum, svuint8x2_t zt); + void svst3q[_u8](svbool_t pg, uint8_t *rn, svuint8x3_t zt); + void svst3q_vnum[_u8](svbool_t pg, uint8_t *rn, int64_t vnum, svuint8x3_t zt); + void svst4q[_u8](svbool_t pg, uint8_t *rn, svuint8x4_t zt); + void svst4q_vnum[_u8](svbool_t pg, uint8_t *rn, int64_t vnum, svuint8x4_t zt); + ``` + +#### TBLQ + +Programmable table lookup within each quadword vector segment (zeroing). + +``` c + // Variants are also available for: + // _u8, _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + svint8_t svtblq[_s8](svint8_t zn, svuint8_t zm); + ``` + +#### TBXQ + +Programmable table lookup within each quadword vector segment (merging). + +``` c + // Variants are also available for: + // _u8, _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + svint8_t svtbxq[_s8](svint8_t fallback, svint8_t zn, svuint8_t zm); + ``` + +#### UZPQ1, UZPQ2 + +Concatenate elements within each pair of quadword vector segments. 
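
For illustration, a hypothetical helper that uses both forms is sketched below; the prototypes follow.

``` c
  #include <arm_sve.h>

  // Illustrative only: within each pair of 128-bit segments, UZPQ1 gathers
  // the even-numbered elements and UZPQ2 the odd-numbered elements.
  void uzpq_pair(svuint8_t zn, svuint8_t zm,
                 svuint8_t *even, svuint8_t *odd) {
    *even = svuzpq1(zn, zm);
    *odd  = svuzpq2(zn, zm);
  }
 ```
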
+ +``` c + // Variants are also available for: + // _s8, _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + svuint8_t svuzpq1[_u8](svuint8_t zn, svuint8_t zm); + svuint8_t svuzpq2[_u8](svuint8_t zn, svuint8_t zm); + ``` + +#### ZIPQ1, ZIPQ2 + +Interleave elements from halves of each pair of quadword vector segments. + +``` c + // Variants are also available for: + // _s8, _u16, _s16, _u32, _s32, _u64, _s64 + // _bf16, _f16, _f32, _f64 + svuint8_t svzipq1[_u8](svuint8_t zn, svuint8_t zm); + svuint8_t svzipq2[_u8](svuint8_t zn, svuint8_t zm); + ``` + # SME language extensions and intrinsics The specification for SME is in @@ -10167,37 +10563,11 @@ Multi-vector saturating extract narrow Multi-vector saturating extract narrow and interleave ``` c - // Variants are also available for _u16[_s32_x2] and _u16[_u32_x2] - svint16_t svqcvtn_s16[_s32_x2](svint32x2_t zn) __arm_streaming_compatible; - - // Variants are also available for _u8[_s32_x4], _u8[_u32_x4], _s16[_s64_x4], // _u16[_s64_x4] and _u16[_u64_x4] svint8_t svqcvtn_s8[_s32_x4](svint32x4_t zn) __arm_streaming; ``` -#### UDOT, SDOT, FDOT (vectors) - -Multi-vector dot-product (2-way) - -``` c - // Variants are also available for _s32_s16 and _u32_u16 - svfloat32_t svdot[_f32_f16](svfloat32_t zda, svfloat16_t zn, - svfloat16_t zm) - __arm_streaming_compatible; - ``` - -#### UDOT, SDOT, FDOT (indexed) - -Multi-vector dot-product (2-way) - -``` c - // Variants are also available for _s32_s16 and _u32_u16 - svfloat32_t svdot_lane[_f32_f16](svfloat32_t zda, svfloat16_t zn, - svfloat16_t zm, uint64_t imm_idx) - __arm_streaming_compatible; - ``` - #### FDOT, BFDOT, SUDOT, USDOT, SDOT, UDOT (store into ZA, single) Multi-vector dot-product (2-way and 4-way) @@ -10913,29 +11283,6 @@ Multi-vector multiply-subtract long long (widening) __arm_streaming __arm_inout("za"); ``` -#### BFMLSLB, BFMLSLT - -BFloat16 floating-point multiply-subtract long from single-precision (top/bottom) - -``` c - svfloat32_t svbfmlslb[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) - __arm_streaming_compatible; - - - svfloat32_t svbfmlslb_lane[_f32](svfloat32_t zda, svbfloat16_t zn, - svbfloat16_t zm, uint64_t imm_idx) - __arm_streaming_compatible; - - - svfloat32_t svbfmlslt[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) - __arm_streaming_compatible; - - - svfloat32_t svbfmlslt_lane[_f32](svfloat32_t zda, svbfloat16_t zn, - svbfloat16_t zm, uint64_t imm_idx) - __arm_streaming_compatible; - ``` - #### SMAX, SMIN, UMAX, UMIN, FMAX, FMIN (single) Multi-vector min/max @@ -11077,755 +11424,888 @@ Multi-vector floating-point round to integral value svfloat32x4_t svrintp[_f32_x4](svfloat32x4_t zn) __arm_streaming; ``` -#### LD1B, LD1D, LD1H, LD1W +#### LDR, STR -Contiguous load to multi-vector +Spill and fill of ZT0 ``` c - // Variants are also available for _s8 - svuint8x2_t svld1[_u8]_x2(svcount_t png, const uint8_t *rn) - __arm_streaming; - + void svldr_zt(uint64_t zt, const void *rn) + __arm_streaming_compatible __arm_inout("zt0"); - // Variants are also available for _s8 - svuint8x4_t svld1[_u8]_x4(svcount_t png, const uint8_t *rn) - __arm_streaming; + void svstr_zt(uint64_t zt, void *rn) + __arm_streaming_compatible __arm_in("zt0"); + ``` - // Variants are also available for _s8 - svuint8x2_t svld1_vnum[_u8]_x2(svcount_t png, const uint8_t *rn, - int64_t vnum) - __arm_streaming; +#### ZERO +Zero ZT0 - // Variants are also available for _s8 - svuint8x4_t svld1_vnum[_u8]_x4(svcount_t png, const uint8_t *rn, - int64_t vnum) - __arm_streaming; 
+``` c + void svzero_zt(uint64_t zt) + __arm_streaming_compatible __arm_out("zt0"); + ``` +#### LUTI2, LUTI4 - // Variants are also available for _s16, _f16 and _bf16 - svuint16x2_t svld1[_u16]_x2(svcount_t png, const uint16_t *rn) - __arm_streaming; +Lookup table read with 2-bit and 4-bit indexes +``` c + // Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16, + // _zt_bf16, _zt_s32, _zt_u32 and _zt_f32 + svint8_t svluti2_lane_zt_s8(uint64_t zt, svuint8_t zn, uint64_t imm_idx) + __arm_streaming __arm_in("zt0"); - // Variants are also available for _s16, _f16 and _bf16 - svuint16x4_t svld1[_u16]_x4(svcount_t png, const uint16_t *rn) - __arm_streaming; + // Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16, + // _zt_bf16, _zt_s32, _zt_u32 and _zt_f32 + svint8x2_t svluti2_lane_zt_s8_x2(uint64_t zt, svuint8_t zn, + uint64_t imm_idx) + __arm_streaming __arm_in("zt0"); - // Variants are also available for _s16, _f16 and _bf16 - svuint16x2_t svld1_vnum[_u16]_x2(svcount_t png, const uint16_t *rn, - int64_t vnum) - __arm_streaming; + // Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16, + // _zt_bf16, _zt_s32, _zt_u32 and _zt_f32 + svint8x4_t svluti2_lane_zt_s8_x4(uint64_t zt, svuint8_t zn, + uint64_t imm_idx) + __arm_streaming __arm_in("zt0"); - // Variants are also available for _s16, _f16 and _bf16 - svuint16x4_t svld1_vnum[_u16]_x4(svcount_t png, const uint16_t *rn, - int64_t vnum) - __arm_streaming; + // Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16, + // _zt_bf16, _zt_s32, _zt_u32 and _zt_f32 + svint8_t svluti4_lane_zt_s8(uint64_t zt, svuint8_t zn, uint64_t imm_idx) + __arm_streaming __arm_in("zt0"); - // Variants are also available for _s32 and _f32 - svuint32x2_t svld1[_u32]_x2(svcount_t png, const uint32_t *rn) - __arm_streaming; + // Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16, + // _zt_bf16, _zt_s32, _zt_u32 and _zt_f32 + svint8x2_t svluti4_lane_zt_s8_x2(uint64_t zt, svuint8_t zn, + uint64_t imm_idx) + __arm_streaming __arm_in("zt0"); - // Variants are also available for _s32 and _f32 - svuint32x4_t svld1[_u32]_x4(svcount_t png, const uint32_t *rn) - __arm_streaming; + // Variants are also available for _zt_u16, _zt_f16, _zt_bf16, _zt_s32, + // _zt_u32 and _zt_f32 + svint16x4_t svluti4_lane_zt_s16_x4(uint64_t zt, svuint8_t zn, + uint64_t imm_idx) + __arm_streaming __arm_in("zt0"); + ``` - // Variants are also available for _s32 and _f32 - svuint32x2_t svld1_vnum[_u32]_x2(svcount_t png, const uint32_t *rn, - int64_t vnum) - __arm_streaming; +#### MOVA +Move multi-vectors to/from ZA - // Variants are also available for _s32 and _f32 - svuint32x4_t svld1_vnum[_u32]_x4(svcount_t png, const uint32_t *rn, - int64_t vnum) - __arm_streaming; +``` c + // Variants are also available for _za8_u8, _za16_s16, _za16_u16, + // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, + // _za64_s64, _za64_u64 and _za64_f64 + svint8x2_t svread_hor_za8_s8_vg2(uint64_t tile, uint32_t slice) + __arm_streaming __arm_in("za"); - // Variants are also available for _s64 and _f64 - svuint64x2_t svld1[_u64]_x2(svcount_t png, const uint64_t *rn) - __arm_streaming; + // Variants are also available for _za8_u8, _za16_s16, _za16_u16, + // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, + // _za64_s64, _za64_u64 and _za64_f64 + svint8x4_t svread_hor_za8_s8_vg4(uint64_t tile, uint32_t slice) + __arm_streaming __arm_in("za"); - // Variants are also available for _s64 and _f64 - svuint64x4_t svld1[_u64]_x4(svcount_t png, const 
uint64_t *rn) - __arm_streaming; + // Variants are also available for _za8_u8, _za16_s16, _za16_u16, + // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, + // _za64_s64, _za64_u64 and _za64_f64 + svint8x2_t svread_ver_za8_s8_vg2(uint64_t tile, uint32_t slice) + __arm_streaming __arm_in("za"); - // Variants are also available for _s64 and _f64 - svuint64x2_t svld1_vnum[_u64]_x2(svcount_t png, const uint64_t *rn, - int64_t vnum) - __arm_streaming; + // Variants are also available for _za8_u8, _za16_s16, _za16_u16, + // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, + // _za64_s64, _za64_u64 and _za64_f64 + svint8x4_t svread_ver_za8_s8_vg4(uint64_t tile, uint32_t slice) + __arm_streaming __arm_in("za"); - // Variants are also available for _s64 and _f64 - svuint64x4_t svld1_vnum[_u64]_x4(svcount_t png, const uint64_t *rn, - int64_t vnum) - __arm_streaming; + // Variants are also available for _za8_u8, _za16_s16, _za16_u16, + // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, + // _za64_s64, _za64_u64 and _za64_f64 + svint8x2_t svread_za8_s8_vg1x2(uint32_t slice) + __arm_streaming __arm_in("za"); + + + // Variants are also available for _za8_u8, _za16_s16, _za16_u16, + // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, + // _za64_s64, _za64_u64 and _za64_f64 + svint8x4_t svread_za8_s8_vg1x4(uint32_t slice) + __arm_streaming __arm_in("za"); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64[_f64] + void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn) + __arm_streaming __arm_inout("za"); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64[_f64] + void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn) + __arm_streaming __arm_inout("za"); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64[_f64] + void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn) + __arm_streaming __arm_inout("za"); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64[_f64] + void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn) + __arm_streaming __arm_inout("za"); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64[_f64] + void svwrite_za8[_s8]_vg1x2(uint32_t slice, svint8x2_t zn) + __arm_streaming __arm_inout("za"); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64[_f64] + void svwrite_za8[_s8]_vg1x4(uint32_t slice, svint8x4_t zn) + __arm_streaming __arm_inout("za"); ``` -#### LDNT1B, LDNT1D, LDNT1H, LDNT1W +#### UCLAMP, SCLAMP, FCLAMP -Contiguous non-temporal load to multi-vector +Multi-vector clamp to minimum/maximum vector ``` c - // Variants are also available for _s8 - svuint8x2_t svldnt1[_u8]_x2(svcount_t png, const uint8_t *rn) + // Variants are also available for _single_s8_x2, 
_single_u8_x2, + // _single_s16_x2, _single_u16_x2, _single_s32_x2, _single_u32_x2, + // _single_f32_x2, _single_s64_x2, _single_u64_x2 and _single_f64_x2 + svfloat16x2_t svclamp[_single_f16_x2](svfloat16x2_t zd, svfloat16_t zn, + svfloat16_t zm) __arm_streaming; - // Variants are also available for _s8 - svuint8x4_t svldnt1[_u8]_x4(svcount_t png, const uint8_t *rn) + // Variants are also available for _single_s8_x4, _single_u8_x4, + // _single_s16_x4, _single_u16_x4, _single_s32_x4, _single_u32_x4, + // _single_f32_x4, _single_s64_x4, _single_u64_x4 and _single_f64_x4 + svfloat16x4_t svclamp[_single_f16_x4](svfloat16x4_t zd, svfloat16_t zn, + svfloat16_t zm) __arm_streaming; + ``` - // Variants are also available for _s8 - svuint8x2_t svldnt1_vnum[_u8]_x2(svcount_t png, const uint8_t *rn, - int64_t vnum) - __arm_streaming; +#### SEL +Multi-vector conditionally select elements from two vectors - // Variants are also available for _s8 - svuint8x4_t svldnt1_vnum[_u8]_x4(svcount_t png, const uint8_t *rn, - int64_t vnum) +``` c + // Variants are also available for _s8_x2, _u16_x2, _s16_x2, _f16_x2, + // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2 + svuint8x2_t svsel[_u8_x2](svcount_t png, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming; - // Variants are also available for _s16, _f16 and _bf16 - svuint16x2_t svldnt1[_u16]_x2(svcount_t png, const uint16_t *rn) + // Variants are also available for _s8_x4, _u16_x4, _s16_x4, _f16_x4, + // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4 + svuint8x4_t svsel[_u8_x4](svcount_t png, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming; + ``` +#### URSHL, SRSHL (single) - // Variants are also available for _s16, _f16 and _bf16 - svuint16x4_t svldnt1[_u16]_x4(svcount_t png, const uint16_t *rn) - __arm_streaming; +Multi-vector rounding shift left +``` c + // Variants are also available for _single_u8_x2, _single_u16_x2, + // _single_s16_x2, _single_u32_x2, _single_s32_x2, _single_u64_x2 + // and _single_s64_x2 + svint8x2_t svrshl[_single_s8_x2](svint8x2_t zdn, svint8_t zm) __arm_streaming; - // Variants are also available for _s16, _f16 and _bf16 - svuint16x2_t svldnt1_vnum[_u16]_x2(svcount_t png, const uint16_t *rn, - int64_t vnum) - __arm_streaming; + // Variants are also available for _single_u8_x4, _single_u16_x4, + // _single_s16_x4, _single_u32_x4, _single_s32_x4, _single_u64_x4 + // and _single_s64_x4 + svint8x4_t svrshl[_single_s8_x4](svint8x4_t zdn, svint8_t zm) __arm_streaming; + ``` - // Variants are also available for _s16, _f16 and _bf16 - svuint16x4_t svldnt1_vnum[_u16]_x4(svcount_t png, const uint16_t *rn, - int64_t vnum) - __arm_streaming; +#### URSHL, SRSHL (multi) + +Multi-vector rounding shift left +``` c + // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _u32_x2, _s32_x2, + // _u64_x2 and _s64_x2 + svint8x2_t svrshl[_s8_x2](svint8x2_t zdn, svint8x2_t zm) __arm_streaming; - // Variants are also available for _s32 and _f32 - svuint32x2_t svldnt1[_u32]_x2(svcount_t png, const uint32_t *rn) + + // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _u32_x4, _s32_x4, + // _u64_x4 and _s64_x4 + svint8x4_t svrshl[_s8_x4](svint8x4_t zdn, svint8x4_t zm) __arm_streaming; + ``` + +#### SQRSHR, UQRSHR + +Multi-vector saturating rounding shift right narrow + +``` c + // Variants are also available for _u8[_u32_x4] + svint8_t svqrshr[_n]_s8[_s32_x4](svint32x4_t zn, uint64_t imm) __arm_streaming; - // Variants are also available for _s32 and _f32 - svuint32x4_t svldnt1[_u32]_x4(svcount_t png, const 
uint32_t *rn) + // Variants are also available for _u16[_u32_x2] + svint16_t svqrshr[_n]_s16[_s32_x2](svint32x2_t zn, uint64_t imm) __arm_streaming; - // Variants are also available for _s32 and _f32 - svuint32x2_t svldnt1_vnum[_u32]_x2(svcount_t png, const uint32_t *rn, - int64_t vnum) + // Variants are also available for _u16[_u64_x4] + svint16_t svqrshr[_n]_s16[_s64_x4](svint64x4_t zn, uint64_t imm) __arm_streaming; + ``` +#### SQRSHRN, UQRSHRN - // Variants are also available for _s32 and _f32 - svuint32x4_t svldnt1_vnum[_u32]_x4(svcount_t png, const uint32_t *rn, - int64_t vnum) +Multi-vector saturating rounding shift right narrow and interleave + +``` c + // Variants are also available for _u8[_u32_x4] + svint8_t svqrshrn[_n]_s8[_s32_x4](svint32x4_t zn, uint64_t imm) __arm_streaming; - // Variants are also available for _s64 and _f64 - svuint64x2_t svldnt1[_u64]_x2(svcount_t png, const uint64_t *rn) + // Variants are also available for _u16[_u64_x4] + svint16_t svqrshrn[_n]_s16[_s64_x4](svint64x4_t zn, uint64_t imm) __arm_streaming; + ``` +#### SQRSHRU - // Variants are also available for _s64 and _f64 - svuint64x4_t svldnt1[_u64]_x4(svcount_t png, const uint64_t *rn) +Multi-vector saturating rounding shift right unsigned narrow + +``` c + svuint8_t svqrshru[_n]_u8[_s32_x4](svint32x4_t zn, uint64_t imm) __arm_streaming; - // Variants are also available for _s64 and _f64 - svuint64x2_t svldnt1_vnum[_u64]_x2(svcount_t png, const uint64_t *rn, - int64_t vnum) + svuint16_t svqrshru[_n]_u16[_s32_x2](svint32x2_t zn, uint64_t imm) __arm_streaming; - // Variants are also available for _s64 and _f64 - svuint64x4_t svldnt1_vnum[_u64]_x4(svcount_t png, const uint64_t *rn, - int64_t vnum) + svuint16_t svqrshru[_n]_u16[_s64_x4](svint64x4_t zn, uint64_t imm) __arm_streaming; ``` -#### ST1B, ST1D, ST1H, ST1W +#### SQRSHRUN -Contiguous store of multi-vector operand +Multi-vector saturating rounding shift right unsigned narrow and interleave ``` c - // Variants are also available for _s8_x2 - void svst1[_u8_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt) + // Variants are also available for _u16[_s64_x4] + svuint8_t svqrshrun[_n]_u8[_s32_x4](svint32x4_t zn, uint64_t imm) __arm_streaming; + ``` + +#### SQDMULH (single) +Multi-vector signed saturating doubling multiply high - // Variants are also available for _s8_x4 - void svst1[_u8_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt) +``` c + // Variants are also available for _single_s16_x2, _single_s32_x2 + // and _single_s64_x2 + svint8x2_t svqdmulh[_single_s8_x2](svint8x2_t zdn, svint8_t zm) __arm_streaming; - // Variants are also available for _s8_x2 - void svst1_vnum[_u8_x2](svcount_t png, uint8_t *rn, int64_t vnum, - svuint8x2_t zt) + // Variants are also available for _single_s16_x4, _single_s32_x4 + // and _single_s64_x4 + svint8x4_t svqdmulh[_single_s8_x4](svint8x4_t zdn, svint8_t zm) __arm_streaming; + ``` +#### SQDMULH (multi) - // Variants are also available for _s8_x4 - void svst1_vnum[_u8_x4](svcount_t png, uint8_t *rn, int64_t vnum, - svuint8x4_t zt) - __arm_streaming; +Multi-vector signed saturating doubling multiply high +``` c + // Variants are also available for _s16_x2, _s32_x2 and _s64_x2 + svint8x2_t svqdmulh[_s8_x2](svint8x2_t zdn, svint8x2_t zm) __arm_streaming; - // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 - void svst1[_u16_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt) - __arm_streaming; + // Variants are also available for _s16_x4, _s32_x4 and _s64_x4 + svint8x4_t svqdmulh[_s8_x4](svint8x4_t zdn, 
svint8x4_t zm) __arm_streaming; + ``` - // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 - void svst1[_u16_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt) - __arm_streaming; +#### SUNPK, UUNPK +Multi-vector pack/unpack - // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 - void svst1_vnum[_u16_x2](svcount_t png, uint16_t *rn, int64_t vnum, - svuint16x2_t zt) - __arm_streaming; +``` c + // Variants are also available for _u16[_u8_x2], _u32[_u16_x2], _s32[_s16_x2], + // _u64[_u32_x2] and _s64[_s32_x2] + svint16x2_t svunpk_s16[_s8_x2](svint8_t zn) __arm_streaming; - // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 - void svst1_vnum[_u16_x4](svcount_t png, uint16_t *rn, int64_t vnum, - svuint16x4_t zt) - __arm_streaming; + // Variants are also available for _u16[_u8_x4], _u32[_u16_x4], _s32[_s16_x4], + // _u64[_u32_x4] and _s64[_s32_x4] + svint16x4_t svunpk_s16[_s8_x4](svint8x2_t zn) __arm_streaming; + ``` +#### ZIP - // Variants are also available for _s32_x2 and _f32_x2 - void svst1[_u32_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt) - __arm_streaming; +Multi-vector zip. +``` c + // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _f16_x2, + // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2 + svint8x2_t svzip[_s8_x2](svint8x2_t zn) __arm_streaming; - // Variants are also available for _s32_x4 and _f32_x4 - void svst1[_u32_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt) - __arm_streaming; + // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _f16_x4, + // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4 + svint8x4_t svzip[_s8_x4](svint8x4_t zn) __arm_streaming; + ``` - // Variants are also available for _s32_x2 and _f32_x2 - void svst1_vnum[_u32_x2](svcount_t png, uint32_t *rn, int64_t vnum, - svuint32x2_t zt) - __arm_streaming; +The `svzipq` intrinsics operate on quad-words, but for convenience accept all +element types. - // Variants are also available for _s32_x4 and _f32_x4 - void svst1_vnum[_u32_x4](svcount_t png, uint32_t *rn, int64_t vnum, - svuint32x4_t zt) - __arm_streaming; +``` c + // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _f16_x2, + // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2 + svint8x2_t svzipq[_s8_x2](svint8x2_t zn) __arm_streaming; - // Variants are also available for _s64_x2 and _f64_x2 - void svst1[_u64_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt) - __arm_streaming; + // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _f16_x4, + // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4 + svint8x4_t svzipq[_s8_x4](svint8x4_t zn) __arm_streaming; + ``` +#### UZP - // Variants are also available for _s64_x4 and _f64_x4 - void svst1[_u64_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt) - __arm_streaming; +Multi-vector unzip. 
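
A minimal sketch of the two-register form is shown below; the helper name is illustrative, and it assumes an SME2 toolchain providing `<arm_sme.h>`. The prototypes follow.

``` c
  #include <arm_sme.h>

  // Illustrative only: de-interleave a pair of vectors; the first result
  // vector receives the even-indexed elements of the concatenated input
  // and the second the odd-indexed elements.
  svint8x2_t deinterleave_pair(svint8x2_t zn) __arm_streaming {
    return svuzp_s8_x2(zn);   // UZP (two registers)
  }
 ```
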
+``` c + // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _f16_x2, + // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2 + svint8x2_t svuzp[_s8_x2](svint8x2_t zn) __arm_streaming; - // Variants are also available for _s64_x2 and _f64_x2 - void svst1_vnum[_u64_x2](svcount_t png, uint64_t *rn, int64_t vnum, - svuint64x2_t zt) - __arm_streaming; + // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _f16_x4, + // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4 + svint8x4_t svuzp[_s8_x4](svint8x4_t zn) __arm_streaming; + ``` - // Variants are also available for _s64_x4 and _f64_x4 - void svst1_vnum[_u64_x4](svcount_t png, uint64_t *rn, int64_t vnum, - svuint64x4_t zt) - __arm_streaming; +The `svuzpq` intrinsics operate on quad-words, but for convenience accept all +element types. + +``` c + // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _f16_x2, + // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2 + svint8x2_t svuzpq[_s8_x2](svint8x2_t zn) __arm_streaming; + + + // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _f16_x4, + // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4 + svint8x4_t svuzpq[_s8_x4](svint8x4_t zn) __arm_streaming; ``` -#### STNT1B, STNT1D, STNT1H, STNT1W +### Streaming-compatible versions of standard routines -Contiguous non-temporal store of multi-vector operand +ACLE provides the following streaming-compatible functions, +with the same behavior as the standard C functions that they +are named after. All of the functions have external linkage. ``` c - // Variants are also available for _s8_x2 - void svstnt1[_u8_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt) - __arm_streaming; + void *__arm_sc_memcpy(void *dest, const void *src, size_t n) + __arm_streaming_compatible; + + void *__arm_sc_memmove(void *dest, const void *src, size_t n) + __arm_streaming_compatible; + + void *__arm_sc_memset(void *s, int c, size_t n) + __arm_streaming_compatible; + + void *__arm_sc_memchr(void *s, int c, size_t n) + __arm_streaming_compatible; +``` + +### SVE2.1 and SME2 instruction intrinsics + +The functions in this section are defined by either the header file + [``](#arm_sve.h) or [``](#arm_sme.h) +when `__ARM_FEATURE_SVE2.1` or `__ARM_FEATURE_SME2` is defined, respectively. + +These intrinsics can only be called from non-streaming code if +`__ARM_FEATURE_SVE2p1` is defined. They can only be called from streaming code +if the appropriate SME feature macro is defined (see previous paragraph). +They can only be called from streaming-compatible code if they could be called +from both non-streaming code and streaming code + +Most function in this section are SME2 or SVE2.1. However some are available in +SME. For convinience the ones available in SME will be tagged in the function +with `[SME]`. + +#### UCLAMP, SCLAMP, FCLAMP + +Clamp to minimum/maximum vector. + +``` c + // Variants are also available for: + // _s8, _u8, _s16, _u16, _s32, _u32 [SME] + // _f32, _s64, _u64 and _f64 + svfloat16_t svclamp[_f16](svfloat16_t op, svfloat16_t min, svfloat16_t max); + ``` + +#### CNTP + +Set scalar to count from predicate-as-counter. ``vl`` is expected to be 2 or 4. 
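
As a brief sketch (helper name illustrative), the count can be used, for example, to advance an index after a predicated multi-vector operation; the prototype follows.

``` c
  #include <arm_sve.h>

  // Illustrative only: number of active 8-bit elements in a
  // predicate-as-counter that governs a pair of vectors (vl == 2).
  uint64_t active_byte_count(svcount_t pn) {
    return svcntp_c8(pn, 2);   // CNTP
  }
 ```
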
+ +``` c + // Variants are also available for _c16, _c32 and _c64 + uint64_t svcntp_c8(svcount_t pnn, uint64_t vl); + ``` + +#### UDOT, SDOT, FDOT (vectors) + +Multi-vector dot-product (2-way) + +``` c + // Variants are also available for _s32_s16 and _u32_u16 + svfloat32_t svdot[_f32_f16](svfloat32_t zda, svfloat16_t zn, + svfloat16_t zm); + ``` + +#### UDOT, SDOT, FDOT (indexed) + +Multi-vector dot-product (2-way) + +``` c + // Variants are also available for _s32_s16 and _u32_u16 + svfloat32_t svdot_lane[_f32_f16](svfloat32_t zda, svfloat16_t zn, + svfloat16_t zm, uint64_t imm_idx); + ``` + +#### LD1B, LD1D, LD1H, LD1W + +Contiguous load to multi-vector + +``` c + // Variants are also available for _s8 + svuint8x2_t svld1[_u8]_x2(svcount_t png, const uint8_t *rn); + + + // Variants are also available for _s8 + svuint8x4_t svld1[_u8]_x4(svcount_t png, const uint8_t *rn); + + + // Variants are also available for _s8 + svuint8x2_t svld1_vnum[_u8]_x2(svcount_t png, const uint8_t *rn, + int64_t vnum); + + + // Variants are also available for _s8 + svuint8x4_t svld1_vnum[_u8]_x4(svcount_t png, const uint8_t *rn, + int64_t vnum); - // Variants are also available for _s8_x4 - void svstnt1[_u8_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt) - __arm_streaming; + // Variants are also available for _s16, _f16 and _bf16 + svuint16x2_t svld1[_u16]_x2(svcount_t png, const uint16_t *rn); - // Variants are also available for _s8_x2 - void svstnt1_vnum[_u8_x2](svcount_t png, uint8_t *rn, int64_t vnum, - svuint8x2_t zt) - __arm_streaming; + // Variants are also available for _s16, _f16 and _bf16 + svuint16x4_t svld1[_u16]_x4(svcount_t png, const uint16_t *rn); - // Variants are also available for _s8_x4 - void svstnt1_vnum[_u8_x4](svcount_t png, uint8_t *rn, int64_t vnum, - svuint8x4_t zt) - __arm_streaming; + // Variants are also available for _s16, _f16 and _bf16 + svuint16x2_t svld1_vnum[_u16]_x2(svcount_t png, const uint16_t *rn, + int64_t vnum); - // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 - void svstnt1[_u16_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt) - __arm_streaming; + // Variants are also available for _s16, _f16 and _bf16 + svuint16x4_t svld1_vnum[_u16]_x4(svcount_t png, const uint16_t *rn, + int64_t vnum); - // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 - void svstnt1[_u16_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt) - __arm_streaming; + // Variants are also available for _s32 and _f32 + svuint32x2_t svld1[_u32]_x2(svcount_t png, const uint32_t *rn); - // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 - void svstnt1_vnum[_u16_x2](svcount_t png, uint16_t *rn, int64_t vnum, - svuint16x2_t zt) - __arm_streaming; + // Variants are also available for _s32 and _f32 + svuint32x4_t svld1[_u32]_x4(svcount_t png, const uint32_t *rn); - // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 - void svstnt1_vnum[_u16_x4](svcount_t png, uint16_t *rn, int64_t vnum, - svuint16x4_t zt) - __arm_streaming; + // Variants are also available for _s32 and _f32 + svuint32x2_t svld1_vnum[_u32]_x2(svcount_t png, const uint32_t *rn, + int64_t vnum); - // Variants are also available for _s32_x2 and _f32_x2 - void svstnt1[_u32_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt) - __arm_streaming; + // Variants are also available for _s32 and _f32 + svuint32x4_t svld1_vnum[_u32]_x4(svcount_t png, const uint32_t *rn, + int64_t vnum); - // Variants are also available for _s32_x4 and _f32_x4 - void svstnt1[_u32_x4](svcount_t png, uint32_t *rn, 
svuint32x4_t zt) - __arm_streaming; + // Variants are also available for _s64 and _f64 + svuint64x2_t svld1[_u64]_x2(svcount_t png, const uint64_t *rn); - // Variants are also available for _s32_x2 and _f32_x2 - void svstnt1_vnum[_u32_x2](svcount_t png, uint32_t *rn, int64_t vnum, - svuint32x2_t zt) - __arm_streaming; + // Variants are also available for _s64 and _f64 + svuint64x4_t svld1[_u64]_x4(svcount_t png, const uint64_t *rn); - // Variants are also available for _s32_x4 and _f32_x4 - void svstnt1_vnum[_u32_x4](svcount_t png, uint32_t *rn, int64_t vnum, - svuint32x4_t zt) - __arm_streaming; + // Variants are also available for _s64 and _f64 + svuint64x2_t svld1_vnum[_u64]_x2(svcount_t png, const uint64_t *rn, + int64_t vnum); - // Variants are also available for _s64_x2 and _f64_x2 - void svstnt1[_u64_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt) - __arm_streaming; + // Variants are also available for _s64 and _f64 + svuint64x4_t svld1_vnum[_u64]_x4(svcount_t png, const uint64_t *rn, + int64_t vnum); + ``` +#### LDNT1B, LDNT1D, LDNT1H, LDNT1W - // Variants are also available for _s64_x4 and _f64_x4 - void svstnt1[_u64_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt) - __arm_streaming; +Contiguous non-temporal load to multi-vector +``` c + // Variants are also available for _s8 + svuint8x2_t svldnt1[_u8]_x2(svcount_t png, const uint8_t *rn); - // Variants are also available for _s64_x2 and _f64_x2 - void svstnt1_vnum[_u64_x2](svcount_t png, uint64_t *rn, int64_t vnum, - svuint64x2_t zt) - __arm_streaming; + // Variants are also available for _s8 + svuint8x4_t svldnt1[_u8]_x4(svcount_t png, const uint8_t *rn); - // Variants are also available for _s64_x4 and _f64_x4 - void svstnt1_vnum[_u64_x4](svcount_t png, uint64_t *rn, int64_t vnum, - svuint64x4_t zt) - __arm_streaming; - ``` -#### LDR, STR + // Variants are also available for _s8 + svuint8x2_t svldnt1_vnum[_u8]_x2(svcount_t png, const uint8_t *rn, + int64_t vnum); -Spill and fill of ZT0 -``` c - void svldr_zt(uint64_t zt, const void *rn) - __arm_streaming_compatible __arm_inout("zt0"); + // Variants are also available for _s8 + svuint8x4_t svldnt1_vnum[_u8]_x4(svcount_t png, const uint8_t *rn, + int64_t vnum); - void svstr_zt(uint64_t zt, void *rn) - __arm_streaming_compatible __arm_in("zt0"); - ``` + // Variants are also available for _s16, _f16 and _bf16 + svuint16x2_t svldnt1[_u16]_x2(svcount_t png, const uint16_t *rn); -#### ZERO -Zero ZT0 + // Variants are also available for _s16, _f16 and _bf16 + svuint16x4_t svldnt1[_u16]_x4(svcount_t png, const uint16_t *rn); -``` c - void svzero_zt(uint64_t zt) - __arm_streaming_compatible __arm_out("zt0"); - ``` -#### LUTI2, LUTI4 + // Variants are also available for _s16, _f16 and _bf16 + svuint16x2_t svldnt1_vnum[_u16]_x2(svcount_t png, const uint16_t *rn, + int64_t vnum); -Lookup table read with 2-bit and 4-bit indexes -``` c - // Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16, - // _zt_bf16, _zt_s32, _zt_u32 and _zt_f32 - svint8_t svluti2_lane_zt_s8(uint64_t zt, svuint8_t zn, uint64_t imm_idx) - __arm_streaming __arm_in("zt0"); + // Variants are also available for _s16, _f16 and _bf16 + svuint16x4_t svldnt1_vnum[_u16]_x4(svcount_t png, const uint16_t *rn, + int64_t vnum); - // Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16, - // _zt_bf16, _zt_s32, _zt_u32 and _zt_f32 - svint8x2_t svluti2_lane_zt_s8_x2(uint64_t zt, svuint8_t zn, - uint64_t imm_idx) - __arm_streaming __arm_in("zt0"); + // Variants are also available for _s32 and _f32 + 
svuint32x2_t svldnt1[_u32]_x2(svcount_t png, const uint32_t *rn); - // Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16, - // _zt_bf16, _zt_s32, _zt_u32 and _zt_f32 - svint8x4_t svluti2_lane_zt_s8_x4(uint64_t zt, svuint8_t zn, - uint64_t imm_idx) - __arm_streaming __arm_in("zt0"); + // Variants are also available for _s32 and _f32 + svuint32x4_t svldnt1[_u32]_x4(svcount_t png, const uint32_t *rn); - // Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16, - // _zt_bf16, _zt_s32, _zt_u32 and _zt_f32 - svint8_t svluti4_lane_zt_s8(uint64_t zt, svuint8_t zn, uint64_t imm_idx) - __arm_streaming __arm_in("zt0"); + // Variants are also available for _s32 and _f32 + svuint32x2_t svldnt1_vnum[_u32]_x2(svcount_t png, const uint32_t *rn, + int64_t vnum); - // Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16, - // _zt_bf16, _zt_s32, _zt_u32 and _zt_f32 - svint8x2_t svluti4_lane_zt_s8_x2(uint64_t zt, svuint8_t zn, - uint64_t imm_idx) - __arm_streaming __arm_in("zt0"); + // Variants are also available for _s32 and _f32 + svuint32x4_t svldnt1_vnum[_u32]_x4(svcount_t png, const uint32_t *rn, + int64_t vnum); - // Variants are also available for _zt_u16, _zt_f16, _zt_bf16, _zt_s32, - // _zt_u32 and _zt_f32 - svint16x4_t svluti4_lane_zt_s16_x4(uint64_t zt, svuint8_t zn, - uint64_t imm_idx) - __arm_streaming __arm_in("zt0"); - ``` + // Variants are also available for _s64 and _f64 + svuint64x2_t svldnt1[_u64]_x2(svcount_t png, const uint64_t *rn); -#### MOVA -Move multi-vectors to/from ZA + // Variants are also available for _s64 and _f64 + svuint64x4_t svldnt1[_u64]_x4(svcount_t png, const uint64_t *rn); -``` c - // Variants are also available for _za8_u8, _za16_s16, _za16_u16, - // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, - // _za64_s64, _za64_u64 and _za64_f64 - svint8x2_t svread_hor_za8_s8_vg2(uint64_t tile, uint32_t slice) - __arm_streaming __arm_in("za"); + // Variants are also available for _s64 and _f64 + svuint64x2_t svldnt1_vnum[_u64]_x2(svcount_t png, const uint64_t *rn, + int64_t vnum); - // Variants are also available for _za8_u8, _za16_s16, _za16_u16, - // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, - // _za64_s64, _za64_u64 and _za64_f64 - svint8x4_t svread_hor_za8_s8_vg4(uint64_t tile, uint32_t slice) - __arm_streaming __arm_in("za"); + // Variants are also available for _s64 and _f64 + svuint64x4_t svldnt1_vnum[_u64]_x4(svcount_t png, const uint64_t *rn, + int64_t vnum); + ``` - // Variants are also available for _za8_u8, _za16_s16, _za16_u16, - // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, - // _za64_s64, _za64_u64 and _za64_f64 - svint8x2_t svread_ver_za8_s8_vg2(uint64_t tile, uint32_t slice) - __arm_streaming __arm_in("za"); +#### BFMLSLB, BFMLSLT +BFloat16 floating-point multiply-subtract long from single-precision (top/bottom) - // Variants are also available for _za8_u8, _za16_s16, _za16_u16, - // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, - // _za64_s64, _za64_u64 and _za64_f64 - svint8x4_t svread_ver_za8_s8_vg4(uint64_t tile, uint32_t slice) - __arm_streaming __arm_in("za"); +``` c + svfloat32_t svbfmlslb[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm); - // Variants are also available for _za8_u8, _za16_s16, _za16_u16, - // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, - // _za64_s64, _za64_u64 and _za64_f64 - svint8x2_t svread_za8_s8_vg1x2(uint32_t slice) - __arm_streaming __arm_in("za"); + svfloat32_t svbfmlslb_lane[_f32](svfloat32_t zda, svbfloat16_t zn, + 
svbfloat16_t zm, uint64_t imm_idx); - // Variants are also available for _za8_u8, _za16_s16, _za16_u16, - // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, - // _za64_s64, _za64_u64 and _za64_f64 - svint8x4_t svread_za8_s8_vg1x4(uint32_t slice) - __arm_streaming __arm_in("za"); + svfloat32_t svbfmlslt[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm); - // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], - // _za64[_s64], _za64[_u64] and _za64[_f64] - void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn) - __arm_streaming __arm_inout("za"); + svfloat32_t svbfmlslt_lane[_f32](svfloat32_t zda, svbfloat16_t zn, + svbfloat16_t zm, uint64_t imm_idx); + ``` +#### PEXT - // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], - // _za64[_s64], _za64[_u64] and _za64[_f64] - void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn) - __arm_streaming __arm_inout("za"); +Transform a predicate-as-counter to a predicate (pair). +``` c + // Variants are also available for _c16, _c32 and _c64 + svbool_t svpext_lane_c8(svcount_t pnn, uint64_t imm); - // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], - // _za64[_s64], _za64[_u64] and _za64[_f64] - void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn) - __arm_streaming __arm_inout("za"); + // Variants are also available for _c16, _c32 and _c64 + svboolx2_t svpext_lane_c8_x2(svcount_t pnn, uint64_t imm); + ``` - // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], - // _za64[_s64], _za64[_u64] and _za64[_f64] - void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn) - __arm_streaming __arm_inout("za"); +#### PSEL +Predicate select between predicate value or all-false - // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], - // _za64[_s64], _za64[_u64] and _za64[_f64] - void svwrite_za8[_s8]_vg1x2(uint32_t slice, svint8x2_t zn) - __arm_streaming __arm_inout("za"); +``` c + // Variants are also available for _b16, _b32 and _b64 [SME] + svbool_t svpsel_lane_b8(svbool_t pn, svbool_t pm, uint32_t idx); - // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], - // _za64[_s64], _za64[_u64] and _za64[_f64] - void svwrite_za8[_s8]_vg1x4(uint32_t slice, svint8x4_t zn) - __arm_streaming __arm_inout("za"); + // Variants are also available for _c16, _c32 and _c64 + svcount_t svpsel_lane_c8(svcount_t pn, svbool_t pm, uint32_t idx); ``` -#### PTRUE +#### PTRUE, PFALSE Initialise predicate-as-counter to all active or all inactive. ``` c // Variants are also available for _c16, _c32 and _c64 - svcount_t svptrue_c8() __arm_streaming; + svcount_t svptrue_c8(); - svcount_t svpfalse_c(void) __arm_streaming_compatible; + svcount_t svpfalse_c(void); ``` +#### REVD -#### PEXT - -Transform a predicate-as-counter to a predicate (pair). +Reverse doublewords in elements. 
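
A minimal sketch of the merging form is shown below (helper name illustrative); the prototypes follow.

``` c
  #include <arm_sve.h>

  // Illustrative only: reverse the two 64-bit doublewords within each
  // 128-bit element, merging inactive elements from zd.
  svuint8_t reverse_doublewords(svuint8_t zd, svbool_t pg, svuint8_t zn) {
    return svrevd_m(zd, pg, zn);   // REVD (merging)
  }
 ```
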
``` c - // Variants are also available for _c16, _c32 and _c64 - svbool_t svpext_lane_c8(svcount_t pnn, uint64_t imm) __arm_streaming; + // All the intrinsics below are [SME] + // Variants are available for: + // _s8, _s16, _u16, _s32, _u32, _s64, _u64 + // _bf16, _f16, _f32, _f64 + svuint8_t svrevd[_u8]_m(svuint8_t zd, svbool_t pg, svuint8_t zn); - // Variants are also available for _c16, _c32 and _c64 - svboolx2_t svpext_lane_c8_x2(svcount_t pnn, uint64_t imm) __arm_streaming; + // Variants are available for: + // _s8, _s16, _u16, _s32, _u32, _s64, _u64 + // _bf16, _f16, _f32, _f64 + svuint8_t svrevd[_u8]_z(svbool_t pg, svuint8_t zn); + + + // Variants are available for: + // _s8, _s16, _u16, _s32, _u32, _s64, _u64 + // _bf16, _f16, _f32, _f64 + svuint8_t svrevd[_u8]_x(svbool_t pg, svuint8_t zn); ``` -#### PSEL +#### SQCVTN, SQCVTUN, UQCVTN -Predicate select between predicate value or all-false +Multi-vector saturating extract narrow and interleave ``` c - // Variants are also available for _c16, _c32 and _c64 - svcount_t svpsel_lane_c8(svcount_t pn, svbool_t pm, uint32_t idx) - __arm_streaming_compatible; + // Variants are also available for _u16[_s32_x2] and _u16[_u32_x2] + svint16_t svqcvtn_s16[_s32_x2](svint32x2_t zn); ``` -#### CNTP +#### SQRSHRN, UQRSHRN -Set scalar to count from predicate-as-counter. ``vl`` is expected to be 2 or 4. +Multi-vector saturating rounding shift right narrow and interleave ``` c - // Variants are also available for _c16, _c32 and _c64 - uint64_t svcntp_c8(svcount_t pnn, uint64_t vl) __arm_streaming; + // Variants are also available for _u16[_u32_x2] + svint16_t svqrshrn[_n]_s16[_s32_x2](svint32x2_t zn, uint64_t imm); ``` -#### UCLAMP, SCLAMP, FCLAMP +#### SQRSHRUN -Multi-vector clamp to minimum/maximum vector +Multi-vector saturating rounding shift right unsigned narrow and interleave ``` c - // Variants are also available for _s8, _u8, _s16, _u16, _s32, _u32, _f32, - // _s64, _u64 and _f64 - svfloat16_t svclamp[_f16](svfloat16_t zd, svfloat16_t zn, svfloat16_t zm) - __arm_streaming_compatible; + svuint16_t svqrshrun[_n]_u16[_s32_x2](svint32x2_t zn, uint64_t imm); + ``` +#### ST1B, ST1D, ST1H, ST1W - // Variants are also available for _single_s8_x2, _single_u8_x2, - // _single_s16_x2, _single_u16_x2, _single_s32_x2, _single_u32_x2, - // _single_f32_x2, _single_s64_x2, _single_u64_x2 and _single_f64_x2 - svfloat16x2_t svclamp[_single_f16_x2](svfloat16x2_t zd, svfloat16_t zn, - svfloat16_t zm) - __arm_streaming; +Contiguous store of multi-vector operand + +``` c + // Variants are also available for _s8_x2 + void svst1[_u8_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt); + + + // Variants are also available for _s8_x4 + void svst1[_u8_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt); + + + // Variants are also available for _s8_x2 + void svst1_vnum[_u8_x2](svcount_t png, uint8_t *rn, int64_t vnum, + svuint8x2_t zt); - // Variants are also available for _single_s8_x4, _single_u8_x4, - // _single_s16_x4, _single_u16_x4, _single_s32_x4, _single_u32_x4, - // _single_f32_x4, _single_s64_x4, _single_u64_x4 and _single_f64_x4 - svfloat16x4_t svclamp[_single_f16_x4](svfloat16x4_t zd, svfloat16_t zn, - svfloat16_t zm) - __arm_streaming; - ``` + // Variants are also available for _s8_x4 + void svst1_vnum[_u8_x4](svcount_t png, uint8_t *rn, int64_t vnum, + svuint8x4_t zt); -#### SEL + // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 + void svst1[_u16_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt); -Multi-vector conditionally select elements from two 
vectors -``` c - // Variants are also available for _s8_x2, _u16_x2, _s16_x2, _f16_x2, - // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2 - svuint8x2_t svsel[_u8_x2](svcount_t png, svuint8x2_t zn, svuint8x2_t zm) - __arm_streaming; + // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 + void svst1[_u16_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt); - // Variants are also available for _s8_x4, _u16_x4, _s16_x4, _f16_x4, - // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4 - svuint8x4_t svsel[_u8_x4](svcount_t png, svuint8x4_t zn, svuint8x4_t zm) - __arm_streaming; - ``` + // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 + void svst1_vnum[_u16_x2](svcount_t png, uint16_t *rn, int64_t vnum, + svuint16x2_t zt); -#### URSHL, SRSHL (single) -Multi-vector rounding shift left + // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 + void svst1_vnum[_u16_x4](svcount_t png, uint16_t *rn, int64_t vnum, + svuint16x4_t zt); -``` c - // Variants are also available for _single_u8_x2, _single_u16_x2, - // _single_s16_x2, _single_u32_x2, _single_s32_x2, _single_u64_x2 - // and _single_s64_x2 - svint8x2_t svrshl[_single_s8_x2](svint8x2_t zdn, svint8_t zm) __arm_streaming; + // Variants are also available for _s32_x2 and _f32_x2 + void svst1[_u32_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt); - // Variants are also available for _single_u8_x4, _single_u16_x4, - // _single_s16_x4, _single_u32_x4, _single_s32_x4, _single_u64_x4 - // and _single_s64_x4 - svint8x4_t svrshl[_single_s8_x4](svint8x4_t zdn, svint8_t zm) __arm_streaming; - ``` -#### URSHL, SRSHL (multi) + // Variants are also available for _s32_x4 and _f32_x4 + void svst1[_u32_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt); -Multi-vector rounding shift left -``` c - // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _u32_x2, _s32_x2, - // _u64_x2 and _s64_x2 - svint8x2_t svrshl[_s8_x2](svint8x2_t zdn, svint8x2_t zm) __arm_streaming; + // Variants are also available for _s32_x2 and _f32_x2 + void svst1_vnum[_u32_x2](svcount_t png, uint32_t *rn, int64_t vnum, + svuint32x2_t zt); - // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _u32_x4, _s32_x4, - // _u64_x4 and _s64_x4 - svint8x4_t svrshl[_s8_x4](svint8x4_t zdn, svint8x4_t zm) __arm_streaming; - ``` + // Variants are also available for _s32_x4 and _f32_x4 + void svst1_vnum[_u32_x4](svcount_t png, uint32_t *rn, int64_t vnum, + svuint32x4_t zt); -#### SQRSHR, UQRSHR -Multi-vector saturating rounding shift right narrow + // Variants are also available for _s64_x2 and _f64_x2 + void svst1[_u64_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt); -``` c - // Variants are also available for _u8[_u32_x4] - svint8_t svqrshr[_n]_s8[_s32_x4](svint32x4_t zn, uint64_t imm) - __arm_streaming; + + // Variants are also available for _s64_x4 and _f64_x4 + void svst1[_u64_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt); - // Variants are also available for _u16[_u32_x2] - svint16_t svqrshr[_n]_s16[_s32_x2](svint32x2_t zn, uint64_t imm) - __arm_streaming; + // Variants are also available for _s64_x2 and _f64_x2 + void svst1_vnum[_u64_x2](svcount_t png, uint64_t *rn, int64_t vnum, + svuint64x2_t zt); - // Variants are also available for _u16[_u64_x4] - svint16_t svqrshr[_n]_s16[_s64_x4](svint64x4_t zn, uint64_t imm) - __arm_streaming; + // Variants are also available for _s64_x4 and _f64_x4 + void svst1_vnum[_u64_x4](svcount_t png, uint64_t *rn, int64_t vnum, + svuint64x4_t zt); ``` -#### SQRSHRN, UQRSHRN 
-#### SQRSHRN, UQRSHRN
+#### STNT1B, STNT1D, STNT1H, STNT1W

-Multi-vector saturating rounding shift right narrow and interleave
+Contiguous non-temporal store of multi-vector operand

 ``` c
-  // Variants are also available for _u8[_u32_x4]
-  svint8_t svqrshrn[_n]_s8[_s32_x4](svint32x4_t zn, uint64_t imm)
-    __arm_streaming;
+  // Variants are also available for _s8_x2
+  void svstnt1[_u8_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt);

-  // Variants are also available for _u16[_u32_x2]
-  svint16_t svqrshrn[_n]_s16[_s32_x2](svint32x2_t zn, uint64_t imm)
-    __arm_streaming_compatible;
+  // Variants are also available for _s8_x4
+  void svstnt1[_u8_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt);

-  // Variants are also available for _u16[_u64_x4]
-  svint16_t svqrshrn[_n]_s16[_s64_x4](svint64x4_t zn, uint64_t imm)
-    __arm_streaming;
- ```
+  // Variants are also available for _s8_x2
+  void svstnt1_vnum[_u8_x2](svcount_t png, uint8_t *rn, int64_t vnum,
+                            svuint8x2_t zt);

-#### SQRSHRU

-Multi-vector saturating rounding shift right unsigned narrow
+  // Variants are also available for _s8_x4
+  void svstnt1_vnum[_u8_x4](svcount_t png, uint8_t *rn, int64_t vnum,
+                            svuint8x4_t zt);

-``` c
-  svuint8_t svqrshru[_n]_u8[_s32_x4](svint32x4_t zn, uint64_t imm)
-    __arm_streaming;
+  // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2
+  void svstnt1[_u16_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt);

-  svuint16_t svqrshru[_n]_u16[_s32_x2](svint32x2_t zn, uint64_t imm)
-    __arm_streaming;
+  // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4
+  void svstnt1[_u16_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt);

-  svuint16_t svqrshru[_n]_u16[_s64_x4](svint64x4_t zn, uint64_t imm)
-    __arm_streaming;
- ```

-#### SQRSHRUN
+  // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2
+  void svstnt1_vnum[_u16_x2](svcount_t png, uint16_t *rn, int64_t vnum,
+                             svuint16x2_t zt);

-Multi-vector saturating rounding shift right unsigned narrow and interleave

-``` c
-  svuint16_t svqrshrun[_n]_u16[_s32_x2](svint32x2_t zn, uint64_t imm)
-    __arm_streaming_compatible;
+  // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4
+  void svstnt1_vnum[_u16_x4](svcount_t png, uint16_t *rn, int64_t vnum,
+                             svuint16x4_t zt);

-  // Variants are also available for _u16[_s64_x4]
-  svuint8_t svqrshrun[_n]_u8[_s32_x4](svint32x4_t zn, uint64_t imm)
-    __arm_streaming;
- ```
+  // Variants are also available for _s32_x2 and _f32_x2
+  void svstnt1[_u32_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt);

-#### SQDMULH (single)

-Multi-vector signed saturating doubling multiply high
+  // Variants are also available for _s32_x4 and _f32_x4
+  void svstnt1[_u32_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt);

-``` c
-  // Variants are also available for _single_s16_x2, _single_s32_x2
-  // and _single_s64_x2
-  svint8x2_t svqdmulh[_single_s8_x2](svint8x2_t zdn, svint8_t zm)
-    __arm_streaming;
+
+  // Variants are also available for _s32_x2 and _f32_x2
+  void svstnt1_vnum[_u32_x2](svcount_t png, uint32_t *rn, int64_t vnum,
+                             svuint32x2_t zt);

-  // Variants are also available for _single_s16_x4, _single_s32_x4
-  // and _single_s64_x4
-  svint8x4_t svqdmulh[_single_s8_x4](svint8x4_t zdn, svint8_t zm)
-    __arm_streaming;
- ```
+  // Variants are also available for _s32_x4 and _f32_x4
+  void svstnt1_vnum[_u32_x4](svcount_t png, uint32_t *rn, int64_t vnum,
+                             svuint32x4_t zt);

-#### SQDMULH (multi)

-Multi-vector signed saturating doubling multiply high
+  // Variants are also available for _s64_x2 and _f64_x2
+  void svstnt1[_u64_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt);

-``` c
-  // Variants are also available for _s16_x2, _s32_x2 and _s64_x2
-  svint8x2_t svqdmulh[_s8_x2](svint8x2_t zdn, svint8x2_t zm) __arm_streaming;
+
+  // Variants are also available for _s64_x4 and _f64_x4
+  void svstnt1[_u64_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt);

-  // Variants are also available for _s16_x4, _s32_x4 and _s64_x4
-  svint8x4_t svqdmulh[_s8_x4](svint8x4_t zdn, svint8x4_t zm) __arm_streaming;
+  // Variants are also available for _s64_x2 and _f64_x2
+  void svstnt1_vnum[_u64_x2](svcount_t png, uint64_t *rn, int64_t vnum,
+                             svuint64x2_t zt);
+
+
+  // Variants are also available for _s64_x4 and _f64_x4
+  void svstnt1_vnum[_u64_x4](svcount_t png, uint64_t *rn, int64_t vnum,
+                             svuint64x4_t zt);
  ```
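+
+The `_vnum` forms address memory at a multiple of the vector length from the
+base pointer. As a non-normative sketch (assuming the `svptrue_c8` intrinsic
+and the same vector-length scaling of `vnum` as the single-vector forms),
+four consecutive vectors of bytes can be written with two x2 stores:
+
+``` c
+  // Illustrative sketch only: vnum is scaled by the vector length,
+  // so the second pair lands immediately after the first pair.
+  void store_four(uint8_t *dst, svuint8x2_t lo, svuint8x2_t hi) {
+    svcount_t pg = svptrue_c8();
+    svstnt1_vnum_u8_x2(pg, dst, 0, lo);  // bytes [0, 2*VL)
+    svstnt1_vnum_u8_x2(pg, dst, 2, hi);  // bytes [2*VL, 4*VL)
+  }
+ ```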
 #### WHILEGE, WHILEGT, WHILEHI, WHILEHS, WHILELE, WHILELO, WHILELS, WHILELT
@@ -11835,26 +12315,22 @@ While (resulting in predicate-as-counter). ``vl`` is expected to be 2 or 4.

 ``` c
   // Variants are also available for _c16[_s64], _c32[_s64] _c64[_s64],
   //  _c8[_u64], _c16[_u64], _c32[_u64] and _c64[_u64]
-  svcount_t svwhilege_c8[_s64](int64_t rn, int64_t rm, uint64_t vl)
-    __arm_streaming;
+  svcount_t svwhilege_c8[_s64](int64_t rn, int64_t rm, uint64_t vl);


   // Variants are also available for _c16[_s64], _c32[_s64] _c64[_s64],
   //  _c8[_u64], _c16[_u64], _c32[_u64] and _c64[_u64]
-  svcount_t svwhilegt_c8[_s64](int64_t rn, int64_t rm, uint64_t vl)
-    __arm_streaming;
+  svcount_t svwhilegt_c8[_s64](int64_t rn, int64_t rm, uint64_t vl);


   // Variants are also available for _c16[_s64], _c32[_s64] _c64[_s64],
   //  _c8[_u64], _c16[_u64], _c32[_u64] and _c64[_u64]
-  svcount_t svwhilele_c8[_s64](int64_t rn, int64_t rm, uint64_t vl)
-    __arm_streaming;
+  svcount_t svwhilele_c8[_s64](int64_t rn, int64_t rm, uint64_t vl);


   // Variants are also available for _c16[_s64], _c32[_s64] _c64[_s64],
   //  _c8[_u64], _c16[_u64], _c32[_u64] and _c64[_u64]
-  svcount_t svwhilelt_c8[_s64](int64_t rn, int64_t rm, uint64_t vl)
-    __arm_streaming;
+  svcount_t svwhilelt_c8[_s64](int64_t rn, int64_t rm, uint64_t vl);
  ```
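+
+As a non-normative sketch of how a predicate-as-counter result is typically
+consumed, and assuming the `svpext_lane_c8`, `svcntp_c8`, `svst1` and `svdup`
+intrinsics defined elsewhere in this specification:
+
+``` c
+  // Illustrative sketch only: zero n bytes. svpext_lane_c8 extracts a
+  // conventional svbool_t predicate for each of the two vectors covered
+  // by the counter predicate, and svcntp_c8 advances the loop.
+  void zero_bytes(uint8_t *dst, uint64_t n) {
+    uint64_t i = 0;
+    while (i < n) {
+      svcount_t pn = svwhilelt_c8_u64(i, n, 2);
+      svbool_t p0 = svpext_lane_c8(pn, 0);
+      svbool_t p1 = svpext_lane_c8(pn, 1);
+      svst1_u8(p0, dst + i, svdup_n_u8(0));
+      svst1_u8(p1, dst + i + svcntb(), svdup_n_u8(0));
+      i += svcntp_c8(pn, 2);
+    }
+  }
+ ```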

 While (resulting in predicate tuple)
@@ -11863,126 +12339,27 @@ While (resulting in predicate tuple)

 ``` c
   // Variants are also available for _b16[_s64]_x2, _b32[_s64]_x2,
   //  _b64[_s64]_x2, _b8[_u64]_x2, _b16[_u64]_x2, _b32[_u64]_x2 and
   //  _b64[_u64]_x2
-  svboolx2_t svwhilege_b8[_s64]_x2(int64_t rn, int64_t rm)
-    __arm_streaming_compatible;
+  svboolx2_t svwhilege_b8[_s64]_x2(int64_t rn, int64_t rm);


   // Variants are also available for _b16[_s64]_x2, _b32[_s64]_x2,
   //  _b64[_s64]_x2, _b8[_u64]_x2, _b16[_u64]_x2, _b32[_u64]_x2 and
   //  _b64[_u64]_x2
-  svboolx2_t svwhilegt_b8[_s64]_x2(int64_t rn, int64_t rm)
-    __arm_streaming_compatible;
+  svboolx2_t svwhilegt_b8[_s64]_x2(int64_t rn, int64_t rm);


   // Variants are also available for _b16[_s64]_x2, _b32[_s64]_x2,
   //  _b64[_s64]_x2, _b8[_u64]_x2, _b16[_u64]_x2, _b32[_u64]_x2 and
   //  _b64[_u64]_x2
-  svboolx2_t svwhilele_b8[_s64]_x2(int64_t rn, int64_t rm)
-    __arm_streaming_compatible;
+  svboolx2_t svwhilele_b8[_s64]_x2(int64_t rn, int64_t rm);


   // Variants are also available for _b16[_s64]_x2, _b32[_s64]_x2,
   //  _b64[_s64]_x2, _b8[_u64]_x2, _b16[_u64]_x2, _b32[_u64]_x2 and
   //  _b64[_u64]_x2
-  svboolx2_t svwhilelt_b8[_s64]_x2(int64_t rn, int64_t rm)
-    __arm_streaming_compatible;
-  ```
-
-#### SUNPK, UUNPK
-
-Multi-vector pack/unpack
-
-``` c
-  // Variants are also available for _u16[_u8_x2], _u32[_u16_x2], _s32[_s16_x2],
-  // _u64[_u32_x2] and _s64[_s32_x2]
-  svint16x2_t svunpk_s16[_s8_x2](svint8_t zn) __arm_streaming;
-
-
-  // Variants are also available for _u16[_u8_x4], _u32[_u16_x4], _s32[_s16_x4],
-  // _u64[_u32_x4] and _s64[_s32_x4]
-  svint16x4_t svunpk_s16[_s8_x4](svint8x2_t zn) __arm_streaming;
- ```
-
-#### ZIP
-
-Multi-vector zip.
-
-``` c
-  // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _f16_x2,
-  // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2
-  svint8x2_t svzip[_s8_x2](svint8x2_t zn) __arm_streaming;
-
-
-  // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _f16_x4,
-  // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4
-  svint8x4_t svzip[_s8_x4](svint8x4_t zn) __arm_streaming;
- ```
-
-The `svzipq` intrinsics operate on quad-words, but for convenience accept all
-element types.
-
-
-``` c
-  // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _f16_x2,
-  // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2
-  svint8x2_t svzipq[_s8_x2](svint8x2_t zn) __arm_streaming;
-
-
-  // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _f16_x4,
-  // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4
-  svint8x4_t svzipq[_s8_x4](svint8x4_t zn) __arm_streaming;
- ```
-
-#### UZP
-
-Multi-vector unzip.
-
-``` c
-  // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _f16_x2,
-  // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2
-  svint8x2_t svuzp[_s8_x2](svint8x2_t zn) __arm_streaming;
-
-
-  // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _f16_x4,
-  // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4
-  svint8x4_t svuzp[_s8_x4](svint8x4_t zn) __arm_streaming;
- ```
-
-The `svuzpq` intrinsics operate on quad-words, but for convenience accept all
-element types.
-
-``` c
-  // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _f16_x2,
-  // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2
-  svint8x2_t svuzpq[_s8_x2](svint8x2_t zn) __arm_streaming;
-
-
-  // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _f16_x4,
-  // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4
-  svint8x4_t svuzpq[_s8_x4](svint8x4_t zn) __arm_streaming;
+  svboolx2_t svwhilelt_b8[_s64]_x2(int64_t rn, int64_t rm);
  ```

-### Streaming-compatible versions of standard routines
-
-ACLE provides the following streaming-compatible functions,
-with the same behavior as the standard C functions that they
-are named after. All of the functions have external linkage.
-
-``` c
-  void *__arm_sc_memcpy(void *dest, const void *src, size_t n)
-    __arm_streaming_compatible;
-
-  void *__arm_sc_memmove(void *dest, const void *src, size_t n)
-    __arm_streaming_compatible;
-
-  void *__arm_sc_memset(void *s, int c, size_t n)
-    __arm_streaming_compatible;
-
-  void *__arm_sc_memchr(void *s, int c, size_t n)
-    __arm_streaming_compatible;
-```
-
-
 # M-profile Vector Extension (MVE) intrinsics

 The M-profile Vector Extension (MVE) [[MVE-spec]](#MVE-spec) instructions provide packed Single