From 3fa6fa2f58b39a7bf83ee96df1832aec9691798f Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 12 Dec 2024 14:15:48 -0800 Subject: [PATCH] pulley: Get `simd_boolean.wast` test passing Fill out some bitmask/test instructions for vectors. --- .../codegen/src/isa/pulley_shared/lower.isle | 29 +++++ .../filetests/runtests/simd-valltrue.clif | 4 + .../filetests/runtests/simd-vanytrue.clif | 4 + .../filetests/runtests/simd-vhighbits.clif | 4 + crates/wast-util/src/lib.rs | 3 - pulley/src/interp.rs | 100 ++++++++++++++++++ pulley/src/lib.rs | 28 +++++ 7 files changed, 169 insertions(+), 3 deletions(-) diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle index f8fbaac542e1..fd849c358f3b 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower.isle +++ b/cranelift/codegen/src/isa/pulley_shared/lower.isle @@ -843,3 +843,32 @@ (rule (lower (has_type $I64X2 (splat a))) (pulley_vsplatx64 a)) (rule (lower (has_type $F32X4 (splat a))) (pulley_vsplatf32 a)) (rule (lower (has_type $F64X2 (splat a))) (pulley_vsplatf64 a)) + +;;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type (fits_in_32 _) (vhigh_bits a @ (value_type $I8X16)))) + (pulley_vbitmask8x16 a)) +(rule (lower (has_type (fits_in_32 _) (vhigh_bits a @ (value_type $I16X8)))) + (pulley_vbitmask16x8 a)) +(rule (lower (has_type (fits_in_32 _) (vhigh_bits a @ (value_type $I32X4)))) + (pulley_vbitmask32x4 a)) +(rule (lower (has_type (fits_in_32 _) (vhigh_bits a @ (value_type $I64X2)))) + (pulley_vbitmask64x2 a)) + +;;;; Rules for `vall_true`; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (vall_true a @ (value_type $I8X16))) (pulley_valltrue8x16 a)) +(rule (lower (vall_true a @ (value_type $I16X8))) (pulley_valltrue16x8 a)) +(rule (lower (vall_true a @ (value_type $I32X4))) (pulley_valltrue32x4 a)) +(rule (lower (vall_true a @ (value_type $I64X2))) (pulley_valltrue64x2 a)) +(rule 
(lower (vall_true a @ (value_type $F32X4))) (pulley_valltrue32x4 a)) +(rule (lower (vall_true a @ (value_type $F64X2))) (pulley_valltrue64x2 a)) + +;;;; Rules for `vany_true`; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (vany_true a @ (value_type $I8X16))) (pulley_vanytrue8x16 a)) +(rule (lower (vany_true a @ (value_type $I16X8))) (pulley_vanytrue16x8 a)) +(rule (lower (vany_true a @ (value_type $I32X4))) (pulley_vanytrue32x4 a)) +(rule (lower (vany_true a @ (value_type $I64X2))) (pulley_vanytrue64x2 a)) +(rule (lower (vany_true a @ (value_type $F32X4))) (pulley_vanytrue32x4 a)) +(rule (lower (vany_true a @ (value_type $F64X2))) (pulley_vanytrue64x2 a)) diff --git a/cranelift/filetests/filetests/runtests/simd-valltrue.clif b/cranelift/filetests/filetests/runtests/simd-valltrue.clif index 60e947b56e12..baec8fcf9f7e 100644 --- a/cranelift/filetests/filetests/runtests/simd-valltrue.clif +++ b/cranelift/filetests/filetests/runtests/simd-valltrue.clif @@ -9,6 +9,10 @@ target x86_64 sse42 has_avx set enable_multi_ret_implicit_sret target riscv64 has_v target riscv64 has_v has_c has_zcb +target pulley32 +target pulley32be +target pulley64 +target pulley64be function %vall_true_i8x16(i8x16) -> i8 { block0(v0: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-vanytrue.clif b/cranelift/filetests/filetests/runtests/simd-vanytrue.clif index a1eb39b8bf9c..0f7a20878a49 100644 --- a/cranelift/filetests/filetests/runtests/simd-vanytrue.clif +++ b/cranelift/filetests/filetests/runtests/simd-vanytrue.clif @@ -8,6 +8,10 @@ target x86_64 sse41 has_avx set enable_multi_ret_implicit_sret target riscv64 has_v target riscv64 has_v has_c has_zcb +target pulley32 +target pulley32be +target pulley64 +target pulley64be function %vany_true_i8x16(i8x16) -> i8 { block0(v0: i8x16): diff --git a/cranelift/filetests/filetests/runtests/simd-vhighbits.clif b/cranelift/filetests/filetests/runtests/simd-vhighbits.clif index 1defc79bae3f..4ecc4e52b4c7 100644 --- 
a/cranelift/filetests/filetests/runtests/simd-vhighbits.clif +++ b/cranelift/filetests/filetests/runtests/simd-vhighbits.clif @@ -7,6 +7,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx set enable_multi_ret_implicit_sret target riscv64 has_v target riscv64 has_v has_c has_zcb +target pulley32 +target pulley32be +target pulley64 +target pulley64be function %vhighbits_i8x16(i8x16) -> i16 { block0(v0: i8x16): diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs index d510585f5479..01be1a00c81a 100644 --- a/crates/wast-util/src/lib.rs +++ b/crates/wast-util/src/lib.rs @@ -405,10 +405,8 @@ impl WastTest { "misc_testsuite/simd/almost-extmul.wast", "misc_testsuite/simd/canonicalize-nan.wast", "misc_testsuite/simd/cvt-from-uint.wast", - "misc_testsuite/simd/issue4807.wast", "misc_testsuite/simd/issue6725-no-egraph-panic.wast", "misc_testsuite/simd/issue_3327_bnot_lowering.wast", - "misc_testsuite/simd/load_splat_out_of_bounds.wast", "misc_testsuite/simd/replace-lane-preserve.wast", "misc_testsuite/simd/spillslot-size-fuzzbug.wast", "misc_testsuite/simd/v128-select.wast", @@ -430,7 +428,6 @@ impl WastTest { "spec_testsuite/proposals/memory64/i16x8_relaxed_q15mulr_s.wast", "spec_testsuite/proposals/memory64/i32x4_relaxed_trunc.wast", "spec_testsuite/proposals/memory64/i8x16_relaxed_swizzle.wast", - "spec_testsuite/simd_boolean.wast", "spec_testsuite/simd_conversions.wast", "spec_testsuite/simd_f32x4.wast", "spec_testsuite/simd_f32x4_arith.wast", diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs index 536b0d45320f..694a47b65efe 100644 --- a/pulley/src/interp.rs +++ b/pulley/src/interp.rs @@ -2825,6 +2825,106 @@ impl OpVisitor for Interpreter<'_> { self.state[dst].set_u128((c & x) | (!c & y)); ControlFlow::Continue(()) } + + fn vbitmask8x16(&mut self, dst: XReg, src: VReg) -> ControlFlow { + let a = self.state[src].get_u8x16(); + let mut result = 0; + for item in a.iter().rev() { + result <<= 1; + result |= (*item >> 7) as u32; + } + 
self.state[dst].set_u32(result); + ControlFlow::Continue(()) + } + + fn vbitmask16x8(&mut self, dst: XReg, src: VReg) -> ControlFlow { + let a = self.state[src].get_u16x8(); + let mut result = 0; + for item in a.iter().rev() { + result <<= 1; + result |= (*item >> 15) as u32; + } + self.state[dst].set_u32(result); + ControlFlow::Continue(()) + } + + fn vbitmask32x4(&mut self, dst: XReg, src: VReg) -> ControlFlow { + let a = self.state[src].get_u32x4(); + let mut result = 0; + for item in a.iter().rev() { + result <<= 1; + result |= *item >> 31; + } + self.state[dst].set_u32(result); + ControlFlow::Continue(()) + } + + fn vbitmask64x2(&mut self, dst: XReg, src: VReg) -> ControlFlow { + let a = self.state[src].get_u64x2(); + let mut result = 0; + for item in a.iter().rev() { + result <<= 1; + result |= (*item >> 63) as u32; + } + self.state[dst].set_u32(result); + ControlFlow::Continue(()) + } + + fn valltrue8x16(&mut self, dst: XReg, src: VReg) -> ControlFlow { + let a = self.state[src].get_u8x16(); + let result = a.iter().all(|a| *a != 0); + self.state[dst].set_u32(u32::from(result)); + ControlFlow::Continue(()) + } + + fn valltrue16x8(&mut self, dst: XReg, src: VReg) -> ControlFlow { + let a = self.state[src].get_u16x8(); + let result = a.iter().all(|a| *a != 0); + self.state[dst].set_u32(u32::from(result)); + ControlFlow::Continue(()) + } + + fn valltrue32x4(&mut self, dst: XReg, src: VReg) -> ControlFlow { + let a = self.state[src].get_u32x4(); + let result = a.iter().all(|a| *a != 0); + self.state[dst].set_u32(u32::from(result)); + ControlFlow::Continue(()) + } + + fn valltrue64x2(&mut self, dst: XReg, src: VReg) -> ControlFlow { + let a = self.state[src].get_u64x2(); + let result = a.iter().all(|a| *a != 0); + self.state[dst].set_u32(u32::from(result)); + ControlFlow::Continue(()) + } + + fn vanytrue8x16(&mut self, dst: XReg, src: VReg) -> ControlFlow { + let a = self.state[src].get_u8x16(); + let result = a.iter().any(|a| *a != 0); + 
self.state[dst].set_u32(u32::from(result)); + ControlFlow::Continue(()) + } + + fn vanytrue16x8(&mut self, dst: XReg, src: VReg) -> ControlFlow { + let a = self.state[src].get_u16x8(); + let result = a.iter().any(|a| *a != 0); + self.state[dst].set_u32(u32::from(result)); + ControlFlow::Continue(()) + } + + fn vanytrue32x4(&mut self, dst: XReg, src: VReg) -> ControlFlow { + let a = self.state[src].get_u32x4(); + let result = a.iter().any(|a| *a != 0); + self.state[dst].set_u32(u32::from(result)); + ControlFlow::Continue(()) + } + + fn vanytrue64x2(&mut self, dst: XReg, src: VReg) -> ControlFlow { + let a = self.state[src].get_u64x2(); + let result = a.iter().any(|a| *a != 0); + self.state[dst].set_u32(u32::from(result)); + ControlFlow::Continue(()) + } } impl ExtendedOpVisitor for Interpreter<'_> { diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs index 04567c931786..baa4c813ca26 100644 --- a/pulley/src/lib.rs +++ b/pulley/src/lib.rs @@ -655,6 +655,34 @@ macro_rules! for_each_op { vbnot128 = VBnot128 { dst: VReg, src: VReg }; /// `dst = (c & x) | (!c & y)` vbitselect128 = VBitselect128 { dst: VReg, c: VReg, x: VReg, y: VReg }; + /// Collect high bits of each lane into the low 32-bits of the + /// destination. + vbitmask8x16 = Vbitmask8x16 { dst: XReg, src: VReg }; + /// Collect high bits of each lane into the low 32-bits of the + /// destination. + vbitmask16x8 = Vbitmask16x8 { dst: XReg, src: VReg }; + /// Collect high bits of each lane into the low 32-bits of the + /// destination. + vbitmask32x4 = Vbitmask32x4 { dst: XReg, src: VReg }; + /// Collect high bits of each lane into the low 32-bits of the + /// destination. + vbitmask64x2 = Vbitmask64x2 { dst: XReg, src: VReg }; + /// Store whether all lanes are nonzero in `dst`. + valltrue8x16 = Valltrue8x16 { dst: XReg, src: VReg }; + /// Store whether all lanes are nonzero in `dst`. + valltrue16x8 = Valltrue16x8 { dst: XReg, src: VReg }; + /// Store whether all lanes are nonzero in `dst`. 
+ valltrue32x4 = Valltrue32x4 { dst: XReg, src: VReg }; + /// Store whether all lanes are nonzero in `dst`. + valltrue64x2 = Valltrue64x2 { dst: XReg, src: VReg }; + /// Store whether any lanes are nonzero in `dst`. + vanytrue8x16 = Vanytrue8x16 { dst: XReg, src: VReg }; + /// Store whether any lanes are nonzero in `dst`. + vanytrue16x8 = Vanytrue16x8 { dst: XReg, src: VReg }; + /// Store whether any lanes are nonzero in `dst`. + vanytrue32x4 = Vanytrue32x4 { dst: XReg, src: VReg }; + /// Store whether any lanes are nonzero in `dst`. + vanytrue64x2 = Vanytrue64x2 { dst: XReg, src: VReg }; } }; }