From fc3c868b13bccf5d7ed26de4cfd1a2dc4c689360 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Mon, 6 Jan 2025 12:34:12 -0600
Subject: [PATCH] pulley: Fill out most remaining simd float ops (#9884)

* pulley: Fill out most remaining simd float ops

Get most simd/float-related tests passing. Mostly reusing preexisting
scalar ops for the simd implementation.

* Fix fma test on MinGW

prtest:full

* More MinGW fixes
---
 .../codegen/src/isa/pulley_shared/lower.isle  |  38 +++++-
 .../filetests/runtests/simd-fadd-splat.clif   |   4 +
 .../filetests/runtests/simd-fadd.clif         |   4 +
 .../filetests/runtests/simd-fcmp-eq.clif      |   4 +
 .../filetests/runtests/simd-fcmp-ge.clif      |   4 +
 .../filetests/runtests/simd-fcmp-gt.clif      |   4 +
 .../filetests/runtests/simd-fcmp-le.clif      |   4 +
 .../filetests/runtests/simd-fcmp-lt.clif      |   4 +
 .../filetests/runtests/simd-fcmp-ne.clif      |   4 +
 .../filetests/runtests/simd-fcmp-uno.clif     |   4 +
 .../filetests/runtests/simd-fdiv.clif         |   4 +
 .../filetests/runtests/simd-floor.clif        |   4 +
 .../filetests/runtests/simd-fma-neg.clif      |   4 +
 .../filetests/runtests/simd-fma.clif          |   4 +
 .../runtests/simd-fmin-max-pseudo.clif        |   4 +
 .../filetests/runtests/simd-fmul.clif         |   4 +
 .../filetests/runtests/simd-fneg.clif         |   4 +
 .../filetests/runtests/simd-fsub.clif         |   4 +
 crates/math/src/lib.rs                        |  14 ++-
 crates/wasmtime/src/runtime/vm/libcalls.rs    |   4 +-
 crates/wast-util/src/lib.rs                   |   8 --
 pulley/src/interp.rs                          | 110 ++++++++++++++++++
 pulley/src/lib.rs                             |  22 ++++
 23 files changed, 247 insertions(+), 17 deletions(-)

diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle
index 72ee094e0b1e..fe19e20bc41c 100644
--- a/cranelift/codegen/src/isa/pulley_shared/lower.isle
+++ b/cranelift/codegen/src/isa/pulley_shared/lower.isle
@@ -756,6 +756,9 @@
 (rule (lower (fcmp cc a b @ (value_type (ty_scalar_float ty))))
   (lower_fcmp ty cc a b))
 
+(rule 1 (lower (fcmp cc a b @ (value_type (ty_vec128 ty))))
+  (lower_vfcmp ty cc a b))
+
 (decl lower_fcmp (Type FloatCC Value Value) XReg)
 
 (rule (lower_fcmp $F32 (FloatCC.Equal) a b) (pulley_feq32 a b))
@@ -787,6 +790,32 @@
   (if-let true (floatcc_unordered cc))
   (pulley_xbxor32_s8 (lower_fcmp ty (floatcc_complement cc) a b) 1))
 
+(decl lower_vfcmp (Type FloatCC Value Value) VReg)
+
+(rule (lower_vfcmp $F32X4 (FloatCC.Equal) a b) (pulley_veqf32x4 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.Equal) a b) (pulley_veqf64x2 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.NotEqual) a b) (pulley_vneqf32x4 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.NotEqual) a b) (pulley_vneqf64x2 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.LessThan) a b) (pulley_vltf32x4 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.LessThan) a b) (pulley_vltf64x2 a b))
+(rule (lower_vfcmp $F32X4 (FloatCC.LessThanOrEqual) a b) (pulley_vlteqf32x4 a b))
+(rule (lower_vfcmp $F64X2 (FloatCC.LessThanOrEqual) a b) (pulley_vlteqf64x2 a b))
+
+(rule (lower_vfcmp ty (FloatCC.Unordered) a b)
+  (pulley_vbor128
+    (lower_vfcmp ty (FloatCC.NotEqual) a a)
+    (lower_vfcmp ty (FloatCC.NotEqual) b b)))
+
+;; NB: Pulley doesn't have vector lowerings for `Ordered` or the `UnorderedOr*`
+;; `FloatCC` conditions as those aren't needed by wasm at this time.
+
+;; Pulley doesn't have instructions for `>` and `>=`, so we have to reverse the
+;; operation.
+(rule (lower_vfcmp ty (FloatCC.GreaterThan) a b)
+  (lower_vfcmp ty (FloatCC.LessThan) b a))
+(rule (lower_vfcmp ty (FloatCC.GreaterThanOrEqual) a b)
+  (lower_vfcmp ty (FloatCC.LessThanOrEqual) b a))
+
 ;;;; Rules for `load` and friends ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (decl amode (Value Offset32) Amode)
@@ -1203,6 +1232,7 @@
       (pulley_vfloor32x4 a))
 (rule (lower (has_type $F64X2 (floor a)))
       (pulley_vfloor64x2 a))
+
 ;;;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $F32 (ceil a))) (pulley_fceil32 a))
@@ -1230,7 +1260,6 @@
 (rule (lower (has_type $F64X2 (sqrt a)))
       (pulley_vsqrt64x2 a))
 
-
 ;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $F32 (fneg a))) (pulley_fneg32 a))
@@ -1407,7 +1436,7 @@
 (rule (lower (scalar_to_vector a @ (value_type $F64)))
   (pulley_vinsertf64 (pulley_vconst128 0) a 0))
 
-;;;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $I8X16 (shuffle a b (u128_from_immediate mask))))
   (pulley_vshuffle a b mask))
@@ -1415,3 +1444,8 @@
 ;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule 1 (lower (has_type $I8X16 (swizzle a b))) (pulley_vswizzlei8x16 a b))
+
+;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type $F32X4 (fma a b c))) (pulley_vfma32x4 a b c))
+(rule (lower (has_type $F64X2 (fma a b c))) (pulley_vfma64x2 a b c))
diff --git a/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif b/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif
index e018fd0fd7a8..c8d3035093ac 100644
--- a/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fadd-splat.clif
@@ -8,6 +8,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %splat_f32x4_2(f32x4) -> f32x4 {
 block0(v0: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fadd.clif b/cranelift/filetests/filetests/runtests/simd-fadd.clif
index 402ee9e44f08..6ca2c6c5779e 100644
--- a/cranelift/filetests/filetests/runtests/simd-fadd.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fadd.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %fadd_f32x4(f32x4, f32x4) -> f32x4 {
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-eq.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-eq.clif
index 378b5f273069..31c86ca33f35 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-eq.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-eq.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_eq_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-ge.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-ge.clif
index b9addbfaadaf..8e7c0e3354bb 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-ge.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-ge.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_ge_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-gt.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-gt.clif
index 25bf525ddda8..947feca07239 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-gt.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-gt.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_gt_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-le.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-le.clif
index e1ec0e911c25..9e498c42518f 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-le.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-le.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_le_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-lt.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-lt.clif
index 0a3fd948825f..0a5c22fc1755 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-lt.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-lt.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_lt_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-ne.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-ne.clif
index 7920996d0357..f9fd58bf54fd 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-ne.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-ne.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_ne_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fcmp-uno.clif b/cranelift/filetests/filetests/runtests/simd-fcmp-uno.clif
index 38886bf1bb09..0fb43c749056 100644
--- a/cranelift/filetests/filetests/runtests/simd-fcmp-uno.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fcmp-uno.clif
@@ -6,6 +6,10 @@ target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %simd_fcmp_uno_f32(f32x4, f32x4) -> i32x4 {
 block0(v0: f32x4, v1: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fdiv.clif b/cranelift/filetests/filetests/runtests/simd-fdiv.clif
index b6707dc86be3..d491e86ab98c 100644
--- a/cranelift/filetests/filetests/runtests/simd-fdiv.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fdiv.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %fdiv_f32x4(f32x4, f32x4) -> f32x4 {
diff --git a/cranelift/filetests/filetests/runtests/simd-floor.clif b/cranelift/filetests/filetests/runtests/simd-floor.clif
index c6e59c9888c8..494cd229d07e 100644
--- a/cranelift/filetests/filetests/runtests/simd-floor.clif
+++ b/cranelift/filetests/filetests/runtests/simd-floor.clif
@@ -9,6 +9,10 @@ target s390x
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %floor_f32x4(f32x4) -> f32x4 {
 block0(v0: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fma-neg.clif b/cranelift/filetests/filetests/runtests/simd-fma-neg.clif
index 1351ef34d091..cd3caabb72dd 100644
--- a/cranelift/filetests/filetests/runtests/simd-fma-neg.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fma-neg.clif
@@ -5,6 +5,10 @@ target aarch64
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 ;; This file is not enabled in the interpreter since SIMD fneg is currently broken
 ;; there.
diff --git a/cranelift/filetests/filetests/runtests/simd-fma.clif b/cranelift/filetests/filetests/runtests/simd-fma.clif
index c3f143987047..91e9c270223b 100644
--- a/cranelift/filetests/filetests/runtests/simd-fma.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fma.clif
@@ -6,6 +6,10 @@ target aarch64
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
 block0(v0: f32x4, v1: f32x4, v2: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif b/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif
index f8d537301f29..9fd071e5e4dd 100644
--- a/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fmin-max-pseudo.clif
@@ -6,6 +6,10 @@ target x86_64 skylake
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fmin_pseudo_f32x4(f32x4, f32x4) -> f32x4 {
 block0(v0:f32x4, v1:f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fmul.clif b/cranelift/filetests/filetests/runtests/simd-fmul.clif
index 9febf85eead8..cca72e1beda7 100644
--- a/cranelift/filetests/filetests/runtests/simd-fmul.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fmul.clif
@@ -8,6 +8,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %fmul_f32x4(f32x4, f32x4) -> f32x4 {
diff --git a/cranelift/filetests/filetests/runtests/simd-fneg.clif b/cranelift/filetests/filetests/runtests/simd-fneg.clif
index 7b56dee100eb..6703e5281159 100644
--- a/cranelift/filetests/filetests/runtests/simd-fneg.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fneg.clif
@@ -9,6 +9,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 function %fneg_f32x4(f32x4) -> f32x4 {
 block0(v0: f32x4):
diff --git a/cranelift/filetests/filetests/runtests/simd-fsub.clif b/cranelift/filetests/filetests/runtests/simd-fsub.clif
index 0322ec2ebf49..9eadc2f38466 100644
--- a/cranelift/filetests/filetests/runtests/simd-fsub.clif
+++ b/cranelift/filetests/filetests/runtests/simd-fsub.clif
@@ -8,6 +8,10 @@ target x86_64 sse42 has_avx
 set enable_multi_ret_implicit_sret
 target riscv64 has_v
 target riscv64 has_v has_c has_zcb
+target pulley32
+target pulley32be
+target pulley64
+target pulley64be
 
 
 function %fsub_f32x4(f32x4, f32x4) -> f32x4 {
diff --git a/crates/math/src/lib.rs b/crates/math/src/lib.rs
index fee95bc10af3..008cedfa7d79 100644
--- a/crates/math/src/lib.rs
+++ b/crates/math/src/lib.rs
@@ -31,7 +31,7 @@ pub trait WasmFloat {
     fn wasm_nearest(self) -> Self;
     fn wasm_minimum(self, other: Self) -> Self;
     fn wasm_maximum(self, other: Self) -> Self;
-    fn mul_add(self, b: Self, c: Self) -> Self;
+    fn wasm_mul_add(self, b: Self, c: Self) -> Self;
 }
 
 impl WasmFloat for f32 {
@@ -148,9 +148,11 @@ impl WasmFloat for f32 {
         }
     }
     #[inline]
-    fn mul_add(self, b: f32, c: f32) -> f32 {
+    fn wasm_mul_add(self, b: f32, c: f32) -> f32 {
+        // The MinGW implementation of `fma` differs from other platforms, so
+        // favor `libm` there instead.
         #[cfg(feature = "std")]
-        if true {
+        if !(cfg!(windows) && cfg!(target_env = "gnu")) {
             return self.mul_add(b, c);
         }
         libm::fmaf(self, b, c)
@@ -271,9 +273,11 @@ impl WasmFloat for f64 {
         }
     }
     #[inline]
-    fn mul_add(self, b: f64, c: f64) -> f64 {
+    fn wasm_mul_add(self, b: f64, c: f64) -> f64 {
+        // The MinGW implementation of `fma` differs from other platforms, so
+        // favor `libm` there instead.
         #[cfg(feature = "std")]
-        if true {
+        if !(cfg!(windows) && cfg!(target_env = "gnu")) {
             return self.mul_add(b, c);
         }
         libm::fma(self, b, c)
diff --git a/crates/wasmtime/src/runtime/vm/libcalls.rs b/crates/wasmtime/src/runtime/vm/libcalls.rs
index cb4d6c04d842..ef9572ea7d8b 100644
--- a/crates/wasmtime/src/runtime/vm/libcalls.rs
+++ b/crates/wasmtime/src/runtime/vm/libcalls.rs
@@ -1289,11 +1289,11 @@ pub mod relocs {
     }
 
     pub extern "C" fn fmaf32(a: f32, b: f32, c: f32) -> f32 {
-        wasmtime_math::WasmFloat::mul_add(a, b, c)
+        wasmtime_math::WasmFloat::wasm_mul_add(a, b, c)
     }
 
     pub extern "C" fn fmaf64(a: f64, b: f64, c: f64) -> f64 {
-        wasmtime_math::WasmFloat::mul_add(a, b, c)
+        wasmtime_math::WasmFloat::wasm_mul_add(a, b, c)
     }
 
     // This intrinsic is only used on x86_64 platforms as an implementation of
diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs
index 657303433c54..d023a006114b 100644
--- a/crates/wast-util/src/lib.rs
+++ b/crates/wast-util/src/lib.rs
@@ -401,17 +401,9 @@ impl WastTest {
         // features in Pulley are implemented.
         if config.compiler == Compiler::CraneliftPulley {
             let unsupported = [
-                "misc_testsuite/simd/canonicalize-nan.wast",
-                "misc_testsuite/simd/issue_3327_bnot_lowering.wast",
                 "misc_testsuite/simd/v128-select.wast",
                 "spec_testsuite/proposals/relaxed-simd/i32x4_relaxed_trunc.wast",
-                "spec_testsuite/proposals/relaxed-simd/relaxed_madd_nmadd.wast",
-                "spec_testsuite/proposals/memory64/relaxed_madd_nmadd.wast",
                 "spec_testsuite/proposals/memory64/i32x4_relaxed_trunc.wast",
-                "spec_testsuite/simd_f32x4_cmp.wast",
-                "spec_testsuite/simd_f32x4_pmin_pmax.wast",
-                "spec_testsuite/simd_f64x2_cmp.wast",
-                "spec_testsuite/simd_f64x2_pmin_pmax.wast",
                 "spec_testsuite/simd_i32x4_trunc_sat_f32x4.wast",
                 "spec_testsuite/simd_i32x4_trunc_sat_f64x2.wast",
                 "spec_testsuite/simd_load.wast",
diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs
index 8c9a8dbc1d81..9202acb6a8c4 100644
--- a/pulley/src/interp.rs
+++ b/pulley/src/interp.rs
@@ -4681,4 +4681,114 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         self.state[operands.dst].set_u16x8(a);
         ControlFlow::Continue(())
     }
+
+    fn veqf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a == b { u32::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vneqf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a != b { u32::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vltf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a < b { u32::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vlteqf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        let mut c = [0; 4];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a <= b { u32::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u32x4(c);
+        ControlFlow::Continue(())
+    }
+
+    fn veqf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a == b { u64::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vneqf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a != b { u64::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vltf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a < b { u64::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vlteqf64x2(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let a = self.state[operands.src1].get_f64x2();
+        let b = self.state[operands.src2].get_f64x2();
+        let mut c = [0; 2];
+        for ((a, b), c) in a.iter().zip(&b).zip(&mut c) {
+            *c = if a <= b { u64::MAX } else { 0 };
+        }
+        self.state[operands.dst].set_u64x2(c);
+        ControlFlow::Continue(())
+    }
+
+    fn vfma32x4(&mut self, dst: VReg, a: VReg, b: VReg, c: VReg) -> ControlFlow<Done> {
+        let mut a = self.state[a].get_f32x4();
+        let b = self.state[b].get_f32x4();
+        let c = self.state[c].get_f32x4();
+        for ((a, b), c) in a.iter_mut().zip(b).zip(c) {
+            *a = a.wasm_mul_add(b, c);
+        }
+        self.state[dst].set_f32x4(a);
+        ControlFlow::Continue(())
+    }
+
+    fn vfma64x2(&mut self, dst: VReg, a: VReg, b: VReg, c: VReg) -> ControlFlow<Done> {
+        let mut a = self.state[a].get_f64x2();
+        let b = self.state[b].get_f64x2();
+        let c = self.state[c].get_f64x2();
+        for ((a, b), c) in a.iter_mut().zip(b).zip(c) {
+            *a = a.wasm_mul_add(b, c);
+        }
+        self.state[dst].set_f64x2(a);
+        ControlFlow::Continue(())
+    }
 }
diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs
index 236346345264..777e54d7f4a3 100644
--- a/pulley/src/lib.rs
+++ b/pulley/src/lib.rs
@@ -1256,6 +1256,28 @@ macro_rules! for_each_extended_op {
             vavground8x16 = Vavground8x16 { operands: BinaryOperands<VReg> };
             /// `dst = (src1 + src2 + 1) // 2`
             vavground16x8 = Vavground16x8 { operands: BinaryOperands<VReg> };
+
+            /// `dst = src1 == src2`
+            veqf32x4 = VeqF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 != src2`
+            vneqf32x4 = VneqF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 < src2`
+            vltf32x4 = VltF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 <= src2`
+            vlteqf32x4 = VlteqF32x4 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 == src2`
+            veqf64x2 = VeqF64x2 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 != src2`
+            vneqf64x2 = VneqF64x2 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 < src2`
+            vltf64x2 = VltF64x2 { operands: BinaryOperands<VReg> };
+            /// `dst = src1 <= src2`
+            vlteqf64x2 = VlteqF64x2 { operands: BinaryOperands<VReg> };
+
+            /// `dst = ieee_fma(a, b, c)`
+            vfma32x4 = Vfma32x4 { dst: VReg, a: VReg, b: VReg, c: VReg };
+            /// `dst = ieee_fma(a, b, c)`
+            vfma64x2 = Vfma64x2 { dst: VReg, a: VReg, b: VReg, c: VReg };
         }
     };
 }