riscv64: Improve codegen for icmp (bytecodealliance#7203)

* riscv64: Constants are always sign-extended Skip generating extra sign-extension instructions in this case because the materialization of a constant will implicitly sign-extend into the entire register. * riscv64: Rename `lower_int_compare` helper Try to reserve `lower_*` as taking a thing and producing a `*Reg`. Rename this helper to `is_nonzero_cmp` to represent how it's testing for nonzero and producing a comparison. * riscv64: Rename some float comparison helpers * `FCmp` -> `FloatCompare` * `emit_fcmp` -> `fcmp_to_float_compare` * `lower_fcmp` -> `lower_float_compare` Make some room for upcoming integer comparison functions. * riscv64: Remove `ordered` helper This is only used by one lowering so inline its definition directly. * riscv64: Remove the `Icmp` pseudo-instruction This commit is the first in a few steps to reimplement bytecodealliance#6112 and bytecodealliance#7113. The `Icmp` pseudo-instruction is removed here and necessary functionality is all pushed into ISLE lowerings. This enables deleting the `lower_br_icmp` helper in `emit.rs` as it's no longer necessary, meaning that all conditional branches should ideally be generated in lowering rather than pseudo-instructions to benefit from various optimizations. Currently the lowering is the bare-bones minimum to get things working. This involved creating a helper to lower an `IntegerCompare` into an `XReg` via negation/swapping args/etc. In generated code this removes branches and makes `icmp` a straight-line instruction for non-128-bit arguments. * riscv64: Remove an unused helper * riscv64: Optimize comparisons with 0 Use the `x0` register which is always zero where possible and avoid unnecessary `xor`s against this register. * riscv64: Specialize `a < $imm` * riscv64: Optimize Equal/NotEqual against constants * riscv64: Optimize LessThan with constant argument * Optimize `a <= $imm` * riscv64: Optimize `a >= $imm` * riscv64: Add comment for new helper * Use i64 in icmp optimizations Matches the sign-extension that happens at the hardware layer. * Correct some sign extensions * riscv64: Don't assume immediates are extended * riscv64: Fix encoding for `c.addi4spn` * riscv64: Remove `icmp` lowerings which modify constants These aren't correct and will need updating * Add regression test * riscv64: Fix handling unsigned comparisons with constants --------- Co-authored-by: Afonso Bordado <[email protected]>
alexcrichton · Oct 16, 2023 · 4f49393 · 4f49393
1 parent e1d66be
commit 4f49393
Show file tree

Hide file tree

Showing 64 changed files with 1,897 additions and 1,370 deletions.
diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle
@@ -241,13 +241,6 @@
       (addr Reg)
       (v Reg)
       (ty Type))
-    ;; an integer compare.
-    (Icmp
-      (cc IntCC)
-      (rd WritableReg)
-      (a ValueRegs)
-      (b ValueRegs)
-      (ty Type))
     (FcvtToInt
       (is_sat bool)
       (rd WritableReg)
@@ -1191,6 +1184,12 @@
 (rule (rv_andi rs1 imm)
   (alu_rr_imm12 (AluOPRRI.Andi) rs1 imm))
 
+;; Helper for emitting the `slt` ("Set Less Than") instruction.
+;; rd ← rs1 < rs2
+(decl rv_slt (XReg XReg) XReg)
+(rule (rv_slt rs1 rs2)
+  (alu_rrr (AluOPRRR.Slt) rs1 rs2))
+
 ;; Helper for emitting the `sltu` ("Set Less Than Unsigned") instruction.
 ;; rd ← rs1 < rs2
 (decl rv_sltu (XReg XReg) XReg)
@@ -1203,6 +1202,12 @@
 (rule (rv_snez rs1)
   (rv_sltu (zero_reg) rs1))
 
+;; Helper for emiting the `slti` ("Set Less Than Immediate") instruction.
+;; rd ← rs1 < imm
+(decl rv_slti (XReg Imm12) XReg)
+(rule (rv_slti rs1 imm)
+  (alu_rr_imm12 (AluOPRRI.Slti) rs1 imm))
+
 ;; Helper for emiting the `sltiu` ("Set Less Than Immediate Unsigned") instruction.
 ;; rd ← rs1 < imm
 (decl rv_sltiu (XReg Imm12) XReg)
@@ -1786,6 +1791,10 @@
 (decl imm12_const_add (i32 i32) Imm12)
 (extern constructor imm12_const_add imm12_const_add)
 
+;; Performs a fallible add of the `Imm12` value and the 32-bit value provided.
+(decl pure partial imm12_add (Imm12 i32) Imm12)
+(extern constructor imm12_add imm12_add)
+
 (decl imm12_and (Imm12 u64) Imm12)
 (extern constructor imm12_and imm12_and)
 
@@ -2667,10 +2676,6 @@
       (high XReg (rv_sub high_tmp borrow)))
     (value_regs low high)))
 
-;; int scalar zero regs.
-(decl int_zero_reg (Type) ValueRegs)
-(extern constructor int_zero_reg int_zero_reg)
-
 ;; Consume a CmpResult, producing a branch on its result.
 (decl cond_br (IntegerCompare CondBrTarget CondBrTarget) SideEffectNoResult)
 (rule (cond_br cmp then else)
@@ -2739,7 +2744,7 @@
 ;; This is used in `Select` and `brif` for example to generate conditional
 ;; branches. The returned comparison, when taken, represents that `Value` is
 ;; nonzero. When not taken the input `Value` is zero.
-(decl lower_int_compare (Value) IntegerCompare)
+(decl is_nonzero_cmp (Value) IntegerCompare)
 
 ;; Base case - convert to a "truthy" value and compare it against zero.
 ;;
@@ -2750,46 +2755,72 @@
 ;; Additionally the base 64-bit ISA has a single instruction for sign-extending
 ;; from 32 to 64-bits which makes that a bit cheaper if used.
 ;; of registers sign-extend the results.
-(rule 0 (lower_int_compare val @ (value_type (fits_in_64 _)))
+(rule 0 (is_nonzero_cmp val @ (value_type (fits_in_64 _)))
   (cmp_nez (sext val)))
-(rule 1 (lower_int_compare val @ (value_type $I8))
+(rule 1 (is_nonzero_cmp val @ (value_type $I8))
   (cmp_nez (zext val)))
-(rule 1 (lower_int_compare val @ (value_type $I128))
+(rule 1 (is_nonzero_cmp val @ (value_type $I128))
   (cmp_nez (rv_or (value_regs_get val 0) (value_regs_get val 1))))
 
-;; If the input value is itself an `icmp` we can avoid generating the result of
-;; the `icmp` and instead move the comparison directly into the `IntegerCompare`
-;; that's returned. Note that comparisons compare full registers so
-;; sign-extension according to the integer comparison performed here is
-;; required.
+;; If the input value is itself an `icmp` or `fcmp` we can avoid generating the
+;; result of the comparison and instead move the comparison directly into the
+;; `IntegerCompare` that's returned.
+(rule 2 (is_nonzero_cmp (maybe_uextend (icmp cc a b @ (value_type (fits_in_64 _)))))
+  (icmp_to_int_compare cc a b))
+(rule 2 (is_nonzero_cmp (maybe_uextend (fcmp cc a @ (value_type ty) b)))
+  (fcmp_to_float_compare cc ty a b))
+
+;; Creates an `IntegerCompare` from an `icmp` node's parts. This will extend
+;; values as necessary to their full register width to perform the
+;; comparison. The returned `IntegerCompare` is suitable to use in conditional
+;; branches for example.
+;;
+;; Note that this should ideally only be used when the `IntegerCompare` returned
+;; is fed into a branch. If `IntegerCompare` is materialized this will miss out
+;; on optimizations to compare against constants using some native instructions.
+(decl icmp_to_int_compare (IntCC Value Value) IntegerCompare)
+(rule 0 (icmp_to_int_compare cc a b @ (value_type (fits_in_64 in_ty)))
+  (int_compare cc (put_value_in_reg_for_icmp cc a) (put_value_in_reg_for_icmp cc b)))
+(rule 1 (icmp_to_int_compare cc a b @ (value_type $I128))
+  (cmp_nez (lower_icmp_i128 cc a b)))
+
+;; Places a `Value` into a full register width to prepare for a comparison
+;; using `IntCC`.
 ;;
-;; Also note that as a small optimization `Equal` and `NotEqual` use
-;; sign-extension for 32-bit values since the same result is produced with
-;; either zero-or-sign extension and many values are already sign-extended given
-;; the RV64 instruction set (e.g. `addw` adds 32-bit values and sign extends),
-;; theoretically resulting in more efficient codegen.
-(rule 2 (lower_int_compare (maybe_uextend (icmp cc a b @ (value_type (fits_in_64 in_ty)))))
-  (int_compare cc (zext a) (zext b)))
-(rule 3 (lower_int_compare (maybe_uextend (icmp cc a b @ (value_type (fits_in_64 in_ty)))))
+;; This is largely a glorified means of choosing sign-extension or
+;; zero-extension for the `Value` input.
+(decl put_value_in_reg_for_icmp (IntCC Value) XReg)
+
+;; Base cases, use the `cc` to determine whether to zero or sign extend.
+(rule 0 (put_value_in_reg_for_icmp cc val)
+  (zext val))
+(rule 1 (put_value_in_reg_for_icmp cc val)
   (if (signed_cond_code cc))
-  (int_compare cc (sext a) (sext b)))
-(rule 4 (lower_int_compare (maybe_uextend (icmp cc @ (IntCC.Equal) a b @ (value_type $I32))))
-  (int_compare cc (sext a) (sext b)))
-(rule 4 (lower_int_compare (maybe_uextend (icmp cc @ (IntCC.NotEqual) a b @ (value_type $I32))))
-  (int_compare cc (sext a) (sext b)))
-
-;; If the input is an `fcmp` then the `FCmp` return value is directly
-;; convertible to `IntegerCompare` which can shave off an instruction from the
-;; fallback lowering above.
-(rule 2 (lower_int_compare (maybe_uextend (fcmp cc a @ (value_type ty) b)))
-  (emit_fcmp cc ty a b))
+  (sext val))
+
+;; For equality and inequality favor sign extension since it's generally
+;; easier to perform sign extension on RV64 via native instructions. For 8-bit
+;; types though use zero-extension since that's a single instruction `and`.
+(rule 2 (put_value_in_reg_for_icmp (IntCC.Equal) val @ (value_type (fits_in_64 _)))
+  (sext val))
+(rule 2 (put_value_in_reg_for_icmp (IntCC.NotEqual) val @ (value_type (fits_in_64 _)))
+  (sext val))
+(rule 3 (put_value_in_reg_for_icmp (IntCC.Equal) val @ (value_type $I8))
+  (zext val))
+(rule 3 (put_value_in_reg_for_icmp (IntCC.NotEqual) val @ (value_type $I8))
+  (zext val))
+
+;; As a special case use `x0` directly if a constant is 0.
+(rule 4 (put_value_in_reg_for_icmp _ (i64_from_iconst 0))
+  (zero_reg))
+
 
 (decl partial lower_branch (Inst MachLabelSlice) Unit)
 (rule (lower_branch (jump _) (single_target label))
       (emit_side_effect (rv_j label)))
 
 (rule (lower_branch (brif v _ _) (two_targets then else))
-  (emit_side_effect (cond_br (lower_int_compare v) then else)))
+  (emit_side_effect (cond_br (is_nonzero_cmp v) then else)))
 
 (decl lower_br_table (Reg MachLabelSlice) Unit)
 (extern constructor lower_br_table lower_br_table)
@@ -2943,8 +2974,9 @@
 (extern constructor sp_reg sp_reg)
 
 ;; Helper for creating the zero register.
-(decl zero_reg () Reg)
+(decl zero_reg () XReg)
 (extern constructor zero_reg zero_reg)
+(extern extractor zero_reg is_zero_reg)
 
 (decl value_regs_zero () ValueRegs)
 (rule (value_regs_zero)
@@ -2956,67 +2988,64 @@
 
 ;;;; Helpers for floating point comparisons ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-(decl is_not_nan (Type FReg) XReg)
-(rule (is_not_nan ty a) (rv_feq ty a a))
-
-(decl ordered (Type FReg FReg) XReg)
-(rule (ordered ty a b) (rv_and (is_not_nan ty a) (is_not_nan ty b)))
-
-(type FCmp (enum
+(type FloatCompare (enum
   ;; The comparison succeeded if `r` is one
   (One (r XReg))
   ;; The comparison succeeded if `r` is zero
   (Zero (r XReg))
 ))
 
-(decl fcmp_invert (FCmp) FCmp)
-(rule (fcmp_invert (FCmp.One r)) (FCmp.Zero r))
-(rule (fcmp_invert (FCmp.Zero r)) (FCmp.One r))
+(decl float_compare_invert (FloatCompare) FloatCompare)
+(rule (float_compare_invert (FloatCompare.One r)) (FloatCompare.Zero r))
+(rule (float_compare_invert (FloatCompare.Zero r)) (FloatCompare.One r))
 
-(decl fcmp_to_compare (FCmp) IntegerCompare)
-(rule (fcmp_to_compare (FCmp.One r)) (cmp_nez r))
-(rule (fcmp_to_compare (FCmp.Zero r)) (cmp_eqz r))
-(convert FCmp IntegerCompare fcmp_to_compare)
+(decl float_to_int_compare (FloatCompare) IntegerCompare)
+(rule (float_to_int_compare (FloatCompare.One r)) (cmp_nez r))
+(rule (float_to_int_compare (FloatCompare.Zero r)) (cmp_eqz r))
+(convert FloatCompare IntegerCompare float_to_int_compare)
 
 ;; Compare two floating point numbers and return a zero/non-zero result.
-(decl emit_fcmp (FloatCC Type FReg FReg) FCmp)
+(decl fcmp_to_float_compare (FloatCC Type FReg FReg) FloatCompare)
 
 ;; Direct codegen for unordered comparisons is not that efficient, so invert
 ;; the comparison to get an ordered comparison and generate that. Then invert
 ;; the result to produce the final fcmp result.
-(rule 0 (emit_fcmp cc ty a b)
+(rule 0 (fcmp_to_float_compare cc ty a b)
   (if-let $true (floatcc_unordered cc))
-  (fcmp_invert (emit_fcmp (floatcc_complement cc) ty a b)))
+  (float_compare_invert (fcmp_to_float_compare (floatcc_complement cc) ty a b)))
 
 ;; a is not nan && b is not nan
-(rule 1 (emit_fcmp (FloatCC.Ordered) ty a b)
-  (FCmp.One (ordered ty a b)))
+(rule 1 (fcmp_to_float_compare (FloatCC.Ordered) ty a b)
+  (FloatCompare.One (rv_and (is_not_nan ty a) (is_not_nan ty b))))
+
+(decl is_not_nan (Type FReg) XReg)
+(rule (is_not_nan ty a) (rv_feq ty a a))
 
 ;; a == b
-(rule 1 (emit_fcmp (FloatCC.Equal) ty a b)
-  (FCmp.One (rv_feq ty a b)))
+(rule 1 (fcmp_to_float_compare (FloatCC.Equal) ty a b)
+  (FloatCompare.One (rv_feq ty a b)))
 
 ;; a != b
 ;; == !(a == b)
-(rule 1 (emit_fcmp (FloatCC.NotEqual) ty a b)
-  (FCmp.Zero (rv_feq ty a b)))
+(rule 1 (fcmp_to_float_compare (FloatCC.NotEqual) ty a b)
+  (FloatCompare.Zero (rv_feq ty a b)))
 
 ;; a < b || a > b
-(rule 1 (emit_fcmp (FloatCC.OrderedNotEqual) ty a b)
-  (FCmp.One (rv_or (rv_flt ty a b) (rv_fgt ty a b))))
+(rule 1 (fcmp_to_float_compare (FloatCC.OrderedNotEqual) ty a b)
+  (FloatCompare.One (rv_or (rv_flt ty a b) (rv_fgt ty a b))))
 
 ;; a < b
-(rule 1 (emit_fcmp (FloatCC.LessThan) ty a b)
-  (FCmp.One (rv_flt ty a b)))
+(rule 1 (fcmp_to_float_compare (FloatCC.LessThan) ty a b)
+  (FloatCompare.One (rv_flt ty a b)))
 
 ;; a <= b
-(rule 1 (emit_fcmp (FloatCC.LessThanOrEqual) ty a b)
-  (FCmp.One (rv_fle ty a b)))
+(rule 1 (fcmp_to_float_compare (FloatCC.LessThanOrEqual) ty a b)
+  (FloatCompare.One (rv_fle ty a b)))
 
 ;; a > b
-(rule 1 (emit_fcmp (FloatCC.GreaterThan) ty a b)
-  (FCmp.One (rv_fgt ty a b)))
+(rule 1 (fcmp_to_float_compare (FloatCC.GreaterThan) ty a b)
+  (FloatCompare.One (rv_fgt ty a b)))
 
 ;; a >= b
-(rule 1 (emit_fcmp (FloatCC.GreaterThanOrEqual) ty a b)
-  (FCmp.One (rv_fge ty a b)))
+(rule 1 (fcmp_to_float_compare (FloatCC.GreaterThanOrEqual) ty a b)
+  (FloatCompare.One (rv_fge ty a b)))