https://gcc.gnu.org/g:b6cef51eaf142eb14e32544d982db0d186f348d9
commit b6cef51eaf142eb14e32544d982db0d186f348d9 Author: Michael Meissner <meiss...@linux.ibm.com> Date: Fri Nov 8 13:17:45 2024 -0500 Add power9 and power10 float to logical optimizations. 2024-11-08 Michael Meissner <meiss...@linux.ibm.com> gcc/ PR target/117487 * config/rs6000/rs6000.cc (sf_logical_op_p): Delete. * config/rs6000/rs6000.h (sf_logical_op_p): Likewise. * config/rs6000/vsx.md (SFmode logical peephole): Update comments in the original code that supports power8. Add a new define_peephole2 to do the optimization on power9/power10. Diff: --- gcc/config/rs6000/rs6000.cc | 62 ----------------- gcc/config/rs6000/rs6000.h | 1 - gcc/config/rs6000/vsx.md | 161 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 155 insertions(+), 69 deletions(-) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index e1ec9591a0eb..aa67e7256bb9 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -29564,68 +29564,6 @@ rs6000_opaque_type_invalid_use_p (gimple *stmt) return false; } -bool -sf_logical_op_p (rtx operands[]) -{ - if (!TARGET_POWERPC64 || !TARGET_DIRECT_MOVE) - { - fprintf (stderr, "!TARGET_POWERPC64 || !TARGET_DIRECT_MOVE\n"); - return false; - } - - /* The REG_P (xxx) tests prevents SUBREG's, which allows us to use REGNO - to compare registers, when the mode is different. 
*/ - if (!REG_P (operands[SFBOOL_MFVSR_D]) && REG_P (operands[SFBOOL_BOOL_D])) - { - fprintf (stderr, "REG_P (operands[SFBOOL_MFVSR_D]) && REG_P (operands[SFBOOL_BOOL_D]))\n"); - return false; - } - - if (!REG_P (operands[SFBOOL_BOOL_A1]) && REG_P (operands[SFBOOL_SHL_D])) - { - fprintf (stderr, "!REG_P (operands[SFBOOL_BOOL_A1]) && REG_P (operands[SFBOOL_SHL_D])\n"); - return false; - } - - if (!REG_P (operands[SFBOOL_SHL_A]) && REG_P (operands[SFBOOL_MTVSR_D])) - { - fprintf (stderr, "!REG_P (operands[SFBOOL_SHL_A]) && REG_P (operands[SFBOOL_MTVSR_D])\n"); - return false; - } - - if (!REG_P (operands[SFBOOL_BOOL_A2]) - && !CONST_INT_P (operands[SFBOOL_BOOL_A2])) - { - fprintf (stderr, "!REG_P (operands[SFBOOL_BOOL_A2]) && !CONST_INT_P (operands[SFBOOL_BOOL_A2])\n"); - return false; - } - - if (!REGNO (operands[SFBOOL_BOOL_D]) == REGNO (operands[SFBOOL_MFVSR_D]) - && !peep2_reg_dead_p (2, operands[SFBOOL_MFVSR_D])) - { - fprintf (stderr, "!REGNO (operands[SFBOOL_BOOL_D]) == REGNO (operands[SFBOOL_MFVSR_D]) && !peep2_reg_dead_p (2, operands[SFBOOL_MFVSR_D])\n"); - return false; - } - - if (((REGNO (operands[SFBOOL_MFVSR_D]) == REGNO (operands[SFBOOL_BOOL_A1]) - || (REG_P (operands[SFBOOL_BOOL_A2]) - && REGNO (operands[SFBOOL_MFVSR_D]) == REGNO (operands[SFBOOL_BOOL_A2]))) - && REGNO (operands[SFBOOL_BOOL_D]) == REGNO (operands[SFBOOL_SHL_A]) - && (REGNO (operands[SFBOOL_SHL_D]) == REGNO (operands[SFBOOL_BOOL_D]) - || peep2_reg_dead_p (3, operands[SFBOOL_BOOL_D])) - && peep2_reg_dead_p (4, operands[SFBOOL_SHL_D]))) - { - fprintf (stderr, "last test passed\n"); - return true; - } - else - { - fprintf (stderr, "last test failed\n"); - return false; - } -} - - struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-rs6000.h" diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index 499e80fda08d..197005af5195 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -2526,4 +2526,3 @@ enum { #undef ARCH_EXPAND #endif /* 
GCC_HWINT_H. */ -extern bool sf_logical_op_p (rtx operands[]); diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index bcf8e2a60462..bfa1516768bc 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -6262,7 +6262,7 @@ (SFBOOL_MFVSR_A 3) ;; move to gpr src (SFBOOL_BOOL_D 4) ;; and/ior/xor dest (SFBOOL_BOOL_A1 5) ;; and/ior/xor arg1 - (SFBOOL_BOOL_A2 6) ;; and/ior/xor arg1 + (SFBOOL_BOOL_A2 6) ;; and/ior/xor arg2 (SFBOOL_SHL_D 7) ;; shift left dest (SFBOOL_SHL_A 8) ;; shift left arg (SFBOOL_MTVSR_D 9) ;; move to vecter dest @@ -6302,18 +6302,18 @@ ;; GPR, and instead move the integer mask value to the vector register after a ;; shift and do the VSX logical operation. -;; The insns for dealing with SFmode in GPR registers looks like: +;; The insns for dealing with SFmode in GPR registers looks like on power8: ;; (set (reg:V4SF reg2) (unspec:V4SF [(reg:SF reg1)] UNSPEC_VSX_CVDPSPN)) ;; -;; (set (reg:DI reg3) (unspec:DI [(reg:V4SF reg2)] UNSPEC_P8V_RELOAD_FROM_VSX)) +;; (set (reg:DI reg3) (zero_extend:DI (reg:SI reg2))) ;; -;; (set (reg:DI reg4) (and:DI (reg:DI reg3) (reg:DI reg3))) +;; (set (reg:DI reg4) (and:SI (reg:SI reg3) (reg:SI mask))) ;; ;; (set (reg:DI reg5) (ashift:DI (reg:DI reg4) (const_int 32))) ;; ;; (set (reg:SF reg6) (unspec:SF [(reg:DI reg5)] UNSPEC_P8V_MTVSRD)) ;; -;; (set (reg:SF reg6) (unspec:SF [(reg:SF reg6)] UNSPEC_VSX_CVSPDPN)) +;; (set (reg:SF reg7) (unspec:SF [(reg:SF reg6)] UNSPEC_VSX_CVSPDPN)) (define_peephole2 [(match_scratch:DI SFBOOL_TMP_GPR "r") @@ -6338,7 +6338,24 @@ (set (match_operand:SF SFBOOL_MTVSR_D "vsx_register_operand") (unspec:SF [(match_dup SFBOOL_SHL_D)] UNSPEC_P8V_MTVSRD))] - "sf_logical_op_p (operands)" + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE + /* The REG_P (xxx) tests prevents SUBREG's, which allows us to use REGNO + to compare registers, when the mode is different. 
*/ + && REG_P (operands[SFBOOL_MFVSR_D]) && REG_P (operands[SFBOOL_BOOL_D]) + && REG_P (operands[SFBOOL_BOOL_A1]) && REG_P (operands[SFBOOL_SHL_D]) + && REG_P (operands[SFBOOL_SHL_A]) && REG_P (operands[SFBOOL_MTVSR_D]) + && (REG_P (operands[SFBOOL_BOOL_A2]) + || CONST_INT_P (operands[SFBOOL_BOOL_A2])) + && (REGNO (operands[SFBOOL_BOOL_D]) == REGNO (operands[SFBOOL_MFVSR_D]) + || peep2_reg_dead_p (2, operands[SFBOOL_MFVSR_D])) + && (REGNO (operands[SFBOOL_MFVSR_D]) == REGNO (operands[SFBOOL_BOOL_A1]) + || (REG_P (operands[SFBOOL_BOOL_A2]) + && REGNO (operands[SFBOOL_MFVSR_D]) + == REGNO (operands[SFBOOL_BOOL_A2]))) + && REGNO (operands[SFBOOL_BOOL_D]) == REGNO (operands[SFBOOL_SHL_A]) + && (REGNO (operands[SFBOOL_SHL_D]) == REGNO (operands[SFBOOL_BOOL_D]) + || peep2_reg_dead_p (3, operands[SFBOOL_BOOL_D])) + && peep2_reg_dead_p (4, operands[SFBOOL_SHL_D])" [(set (match_dup SFBOOL_TMP_GPR) (ashift:DI (match_dup SFBOOL_BOOL_A_DI) (const_int 32))) @@ -6377,6 +6394,138 @@ operands[SFBOOL_MTVSR_D_V4SF] = gen_rtx_REG (V4SFmode, regno_mtvsr_d); }) +;; Constants for SFbool optimization on power9/power10 +(define_constants + [(SFBOOL2_TMP_VSX_V4SI 0) ;; vector temporary (V4SI) + (SFBOOL2_TMP_GPR_SI 1) ;; GPR temporary (SI) + (SFBOOL2_MFVSR_D 2) ;; move to gpr dest (DI) + (SFBOOL2_MFVSR_A 3) ;; move to gpr src (SI) + (SFBOOL2_BOOL_D 4) ;; and/ior/xor dest (SI) + (SFBOOL2_BOOL_A1 5) ;; and/ior/xor arg1 (SI) + (SFBOOL2_BOOL_A2 6) ;; and/ior/xor arg2 (SI) + (SFBOOL2_SPLAT_D 7) ;; splat dest (V4SI) + (SFBOOL2_MTVSR_D 8) ;; move/splat to VSX dest. + (SFBOOL2_MTVSR_A 9) ;; move/splat to VSX arg. + (SFBOOL2_MFVSR_A_V4SI 10) ;; MFVSR_A as V4SI + (SFBOOL2_MTVSR_D_V4SI 11) ;; MTVSR_D as V4SI + (SFBOOL2_XXSPLTW 12)]) ;; 1 (BE) or 2 (LE) for XXSPLTW + +;; On power9/power10, the code is different because we have a splat 32-bit +;; operation that does a direct move to the FPR/vector registers (MTVSRWS). 
+;; +;; The insns for dealing with SFmode in GPR registers looks like on +;; power9/power10: +;; +;; (set (reg:V4SF reg2) (unspec:V4SF [(reg:SF reg1)] UNSPEC_VSX_CVDPSPN)) +;; +;; (set (reg:DI reg3) (zero_extend:DI (reg:SI reg2))) +;; +;; (set (reg:SI reg4) (and:SI (reg:SI reg3) (reg:SI mask))) +;; +;; (set (reg:V4SI reg5) (vec_duplicate:V4SI (reg:SI reg4))) +;; +;; (set (reg:SF reg6) (unspec:SF [(reg:SF reg5)] UNSPEC_VSX_CVSPDPN)) + +;; The VSX temporary needs to be an Altivec register in case we are trying to +;; do and/ior/xor of -16..15 and we want to use VSPLTISW to load the constant. +;; +;; The GPR temporary is only used if we are trying to do a logical operation +;; with a constant outside of the -16..15 range on a power9. Otherwise, we can +;; load the constant directly into the VSX temporary register. + +(define_peephole2 + [(match_scratch:V4SI SFBOOL2_TMP_VSX_V4SI "v") + (match_scratch:SI SFBOOL2_TMP_GPR_SI "r") + + ;; Zero_extend and direct move + (set (match_operand:DI SFBOOL2_MFVSR_D "int_reg_operand") + (zero_extend:DI + (match_operand:SI SFBOOL2_MFVSR_A "vsx_register_operand"))) + + ;; AND/IOR/XOR operation on int + (set (match_operand:SI SFBOOL2_BOOL_D "int_reg_operand") + (and_ior_xor:SI + (match_operand:SI SFBOOL2_BOOL_A1 "int_reg_operand") + (match_operand:SI SFBOOL2_BOOL_A2 "reg_or_cint_operand"))) + + ;; Splat sfbool result to vector register + (set (match_operand:V4SI SFBOOL2_SPLAT_D "vsx_register_operand") + (vec_duplicate:V4SI + (match_dup SFBOOL2_BOOL_D)))] + + "TARGET_POWERPC64 && TARGET_P9_VECTOR + && REG_P (operands[SFBOOL2_MFVSR_D]) + && REG_P (operands[SFBOOL2_BOOL_A1]) + && (REGNO (operands[SFBOOL2_MFVSR_D]) == REGNO (operands[SFBOOL2_BOOL_A1]) + || (REG_P (operands[SFBOOL2_BOOL_A2]) + && (REGNO (operands[SFBOOL2_MFVSR_D]) + == REGNO (operands[SFBOOL2_BOOL_A2])))) + && peep2_reg_dead_p (3, operands[SFBOOL2_MFVSR_D]) + && peep2_reg_dead_p (4, operands[SFBOOL2_BOOL_D])" + + ;; Either (set (reg:SI xxx) (reg:SI yyy)) or + ;; (set 
(reg:V4SI xxx) (const_vector (parallel [c, c, c, c]))) + [(set (match_dup SFBOOL2_MTVSR_D) + (match_dup SFBOOL2_MTVSR_A)) + + ;; And/ior/xor on vector registers + (set (match_dup SFBOOL2_TMP_VSX_V4SI) + (and_ior_xor:V4SI + (match_dup SFBOOL2_MFVSR_A_V4SI) + (match_dup SFBOOL2_TMP_VSX_V4SI))) + + ;; XXSPLTW t,r,r,1 + (set (match_dup SFBOOL2_SPLAT_D) + (vec_duplicate:V4SI + (vec_select:SI + (match_dup SFBOOL2_TMP_VSX_V4SI) + (parallel [(match_dup SFBOOL2_XXSPLTW)]))))] +{ + rtx mfvsr_d = operands[SFBOOL2_MFVSR_D]; + rtx bool_a1 = operands[SFBOOL2_BOOL_A1]; + rtx bool_a2 = operands[SFBOOL2_BOOL_A2]; + rtx bool_arg = (rtx_equal_p (mfvsr_d, bool_a1) ? bool_a2 : bool_a1); + int regno_mfvsr_a = REGNO (operands[SFBOOL2_MFVSR_A]); + int regno_tmp_vsx = REGNO (operands[SFBOOL2_TMP_VSX_V4SI]); + + /* If the logical operation is a constant, form the constant in a vector + register. */ + if (CONST_INT_P (bool_arg)) + { + HOST_WIDE_INT value = INTVAL (bool_arg); + + /* See if we can directly load the constant, either by VSPLTISW or by + XXSPLTIW on power10. */ + + if (IN_RANGE (value, -16, 15) || TARGET_PREFIXED) + { + rtvec cv = gen_rtvec (4, bool_arg, bool_arg, bool_arg, bool_arg); + operands[SFBOOL2_MTVSR_D] = gen_rtx_REG (V4SImode, regno_tmp_vsx); + operands[SFBOOL2_MTVSR_A] = gen_rtx_CONST_VECTOR (V4SImode, cv); + } + + else + { + /* We need to load up the constant to a GPR and move it to a + vector register. */ + rtx tmp_gpr = operands[SFBOOL2_TMP_GPR_SI]; + emit_move_insn (tmp_gpr, bool_arg); + operands[SFBOOL2_MTVSR_D] = gen_rtx_REG (SImode, regno_tmp_vsx); + operands[SFBOOL2_MTVSR_A] = tmp_gpr; + } + } + else + { + /* Mask is in a register, move it to a vector register. 
*/ + operands[SFBOOL2_MTVSR_D] = gen_rtx_REG (SImode, regno_tmp_vsx); + operands[SFBOOL2_MTVSR_A] = bool_arg; + } + + operands[SFBOOL2_TMP_VSX_V4SI] = gen_rtx_REG (V4SImode, regno_tmp_vsx); + operands[SFBOOL2_MFVSR_A_V4SI] = gen_rtx_REG (V4SImode, regno_mfvsr_a); + operands[SFBOOL2_XXSPLTW] = GEN_INT (BYTES_BIG_ENDIAN ? 1 : 2); +}) + ;; Support signed/unsigned long long to float conversion vectorization. ;; Note that any_float (pc) here is just for code attribute <su>. (define_expand "vec_pack<su>_float_v2di"