https://gcc.gnu.org/g:1c46a541c6957e8b0eee339d4cff46e951a5ad4e
commit r15-4889-g1c46a541c6957e8b0eee339d4cff46e951a5ad4e
Author: Kyrylo Tkachov <ktkac...@nvidia.com>
Date:   Mon Nov 4 07:25:16 2024 -0800

    PR 117048: simplify-rtx: Simplify (X << C1) [+,^] (X >> C2) into ROTATE

    This is, in effect, a reapplication of de2bc6a7367aca2eecc925ebb64cfb86998d89f3
    fixing the compile-time hog in var-tracking due to calling simplify_rtx on the
    two arms of the rotation before detecting the ROTATE.  That is not necessary.

    simplify-rtx can transform (X << C1) | (X >> C2) into ROTATE (X, C1) when
    C1 + C2 == mode-width.  But the transformation is also valid for PLUS and XOR.
    Indeed GIMPLE can also do the fold.  Let's teach RTL to do it too.

    The motivating testcase for this is in AArch64 intrinsics:

    uint64x2_t G2(uint64x2_t a, uint64x2_t b)
    {
        uint64x2_t c = veorq_u64(a, b);
        return veorq_u64(vaddq_u64(c, c), vshrq_n_u64(c, 63));
    }

    which I was hoping to fold to a single XAR (a ROTATE+XOR instruction) but GCC
    was failing to detect the rotate operation for two reasons:
    1) The combination of the two arms of the expression is done under XOR rather
    than IOR that simplify-rtx currently supports.
    2) The ASHIFT operation is actually a (PLUS X X) operation and thus is not
    detected as the LHS of the two arms we require.

    The patch fixes both issues.  The analysis of the two arms of the rotation
    expression is factored out into a common helper simplify_rotate_op which is
    then used in the PLUS, XOR, IOR cases in simplify_binary_operation_1.

    The check-assembly testcase for this is added in the following patch because
    it needs some extra AArch64 backend work, but I've added self-tests in this
    patch to validate the transformation.

    Bootstrapped and tested on aarch64-none-linux-gnu

    Signed-off-by: Kyrylo Tkachov <ktac...@nvidia.com>

        PR target/117048
        * simplify-rtx.cc (extract_ashift_operands_p): Define.
        (simplify_rotate_op): Likewise.
        (simplify_context::simplify_binary_operation_1): Use the above in
        the PLUS, IOR, XOR cases.
        (test_vector_rotate): Define.
        (test_vector_ops): Use the above.
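The fold is valid under PLUS and XOR as well as IOR because, when C1 + C2 equals
the mode width, the two shifted values have no set bits in common, so all three
operations compute the same value as the rotate.  A minimal scalar sketch of that
identity (an illustration only, not part of the patch or its testsuite):

    #include <stdint.h>
    #include <assert.h>

    /* With shift counts c and 64 - c the two halves occupy disjoint bit
       ranges, so |, ^ and + all form the same left-rotation by c.  */
    static uint64_t rot_ior (uint64_t x, unsigned c) { return (x << c) | (x >> (64 - c)); }
    static uint64_t rot_xor (uint64_t x, unsigned c) { return (x << c) ^ (x >> (64 - c)); }
    static uint64_t rot_add (uint64_t x, unsigned c) { return (x << c) + (x >> (64 - c)); }

    int main (void)
    {
      uint64_t x = 0x0123456789abcdefULL;
      for (unsigned c = 1; c < 64; c++)
        {
          assert (rot_ior (x, c) == rot_xor (x, c));
          assert (rot_ior (x, c) == rot_add (x, c));
        }
      return 0;
    }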
Diff:
---
 gcc/simplify-rtx.cc | 204 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 156 insertions(+), 48 deletions(-)

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index ce8d3879270d..893c5f6e1ae0 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -2820,6 +2820,104 @@ reverse_rotate_by_imm_p (machine_mode mode, unsigned int left, rtx op1)
   return false;
 }
 
+/* Analyse argument X to see if it represents an (ASHIFT X Y) operation
+   and return the expression to be shifted in SHIFT_OPND and the shift amount
+   in SHIFT_AMNT.  This is primarily used to group handling of ASHIFT (X, CST)
+   and (PLUS (X, X)) in one place.  If the expression is not equivalent to an
+   ASHIFT then return FALSE and set SHIFT_OPND and SHIFT_AMNT to NULL.  */
+
+static bool
+extract_ashift_operands_p (rtx x, rtx *shift_opnd, rtx *shift_amnt)
+{
+  if (GET_CODE (x) == ASHIFT)
+    {
+      *shift_opnd = XEXP (x, 0);
+      *shift_amnt = XEXP (x, 1);
+      return true;
+    }
+  if (GET_CODE (x) == PLUS && rtx_equal_p (XEXP (x, 0), XEXP (x, 1)))
+    {
+      *shift_opnd = XEXP (x, 0);
+      *shift_amnt = CONST1_RTX (GET_MODE (x));
+      return true;
+    }
+  *shift_opnd = NULL_RTX;
+  *shift_amnt = NULL_RTX;
+  return false;
+}
+
+/* OP0 and OP1 are combined under an operation of mode MODE that can
+   potentially result in a ROTATE expression.  Analyze the OP0 and OP1
+   and return the resulting ROTATE expression if so.  Return NULL otherwise.
+   This is used in detecting the patterns (X << C1) [+,|,^] (X >> C2) where
+   C1 + C2 == GET_MODE_UNIT_PRECISION (mode).
+   (X << C1) and (X >> C2) would be OP0 and OP1.  */
+
+static rtx
+simplify_rotate_op (rtx op0, rtx op1, machine_mode mode)
+{
+  /* Convert (ior (ashift A CX) (lshiftrt A CY)) where CX+CY equals the
+     mode size to (rotate A CX).  */
+
+  rtx opleft = op0;
+  rtx opright = op1;
+  rtx ashift_opnd, ashift_amnt;
+  /* In some cases the ASHIFT is not a direct ASHIFT.  Look deeper and extract
+     the relevant operands here.  */
+  bool ashift_op_p
+    = extract_ashift_operands_p (op1, &ashift_opnd, &ashift_amnt);
+
+  if (ashift_op_p
+      || GET_CODE (op1) == SUBREG)
+    {
+      opleft = op1;
+      opright = op0;
+    }
+  else
+    {
+      opright = op1;
+      opleft = op0;
+      ashift_op_p
+        = extract_ashift_operands_p (opleft, &ashift_opnd, &ashift_amnt);
+    }
+
+  if (ashift_op_p && GET_CODE (opright) == LSHIFTRT
+      && rtx_equal_p (ashift_opnd, XEXP (opright, 0)))
+    {
+      rtx leftcst = unwrap_const_vec_duplicate (ashift_amnt);
+      rtx rightcst = unwrap_const_vec_duplicate (XEXP (opright, 1));
+
+      if (CONST_INT_P (leftcst) && CONST_INT_P (rightcst)
+          && (INTVAL (leftcst) + INTVAL (rightcst)
+              == GET_MODE_UNIT_PRECISION (mode)))
+        return gen_rtx_ROTATE (mode, XEXP (opright, 0), ashift_amnt);
+    }
+
+  /* Same, but for ashift that has been "simplified" to a wider mode
+     by simplify_shift_const.  */
+  scalar_int_mode int_mode, inner_mode;
+
+  if (GET_CODE (opleft) == SUBREG
+      && is_a <scalar_int_mode> (mode, &int_mode)
+      && is_a <scalar_int_mode> (GET_MODE (SUBREG_REG (opleft)),
+                                 &inner_mode)
+      && GET_CODE (SUBREG_REG (opleft)) == ASHIFT
+      && GET_CODE (opright) == LSHIFTRT
+      && GET_CODE (XEXP (opright, 0)) == SUBREG
+      && known_eq (SUBREG_BYTE (opleft), SUBREG_BYTE (XEXP (opright, 0)))
+      && GET_MODE_SIZE (int_mode) < GET_MODE_SIZE (inner_mode)
+      && rtx_equal_p (XEXP (SUBREG_REG (opleft), 0),
+                      SUBREG_REG (XEXP (opright, 0)))
+      && CONST_INT_P (XEXP (SUBREG_REG (opleft), 1))
+      && CONST_INT_P (XEXP (opright, 1))
+      && (INTVAL (XEXP (SUBREG_REG (opleft), 1))
+          + INTVAL (XEXP (opright, 1))
+          == GET_MODE_PRECISION (int_mode)))
+    return gen_rtx_ROTATE (int_mode, XEXP (opright, 0),
+                           XEXP (SUBREG_REG (opleft), 1));
+  return NULL_RTX;
+}
+
 /* Subroutine of simplify_binary_operation.  Simplify a binary operation
    CODE with result mode MODE, operating on OP0 and OP1.  If OP0 and/or
    OP1 are constant pool references, TRUEOP0 and TRUEOP1 represent the
@@ -2831,7 +2929,7 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
                                                rtx op0, rtx op1,
                                                rtx trueop0, rtx trueop1)
 {
-  rtx tem, reversed, opleft, opright, elt0, elt1;
+  rtx tem, reversed, elt0, elt1;
   HOST_WIDE_INT val;
   scalar_int_mode int_mode, inner_mode;
   poly_int64 offset;
@@ -3030,6 +3128,11 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
         return simplify_gen_unary (NEG, mode, reversed, mode);
 
+      /* Convert (plus (ashift A CX) (lshiftrt A CY)) where CX+CY equals the
+         mode size to (rotate A CX).  */
+      if ((tem = simplify_rotate_op (op0, op1, mode)))
+        return tem;
+
       /* If one of the operands is a PLUS or a MINUS, see if we can
          simplify this by the associative law.  Don't use the associative law
          for floating point.
@@ -3462,53 +3565,10 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
         return op1;
 
       /* Convert (ior (ashift A CX) (lshiftrt A CY)) where CX+CY equals the
-         mode size to (rotate A CX).  */
-
-      if (GET_CODE (op1) == ASHIFT
-          || GET_CODE (op1) == SUBREG)
-        {
-          opleft = op1;
-          opright = op0;
-        }
-      else
-        {
-          opright = op1;
-          opleft = op0;
-        }
-
-      if (GET_CODE (opleft) == ASHIFT && GET_CODE (opright) == LSHIFTRT
-          && rtx_equal_p (XEXP (opleft, 0), XEXP (opright, 0)))
-        {
-          rtx leftcst = unwrap_const_vec_duplicate (XEXP (opleft, 1));
-          rtx rightcst = unwrap_const_vec_duplicate (XEXP (opright, 1));
-
-          if (CONST_INT_P (leftcst) && CONST_INT_P (rightcst)
-              && (INTVAL (leftcst) + INTVAL (rightcst)
-                  == GET_MODE_UNIT_PRECISION (mode)))
-            return gen_rtx_ROTATE (mode, XEXP (opright, 0), XEXP (opleft, 1));
-        }
-
-      /* Same, but for ashift that has been "simplified" to a wider mode
-         by simplify_shift_const.  */
-
-      if (GET_CODE (opleft) == SUBREG
-          && is_a <scalar_int_mode> (mode, &int_mode)
-          && is_a <scalar_int_mode> (GET_MODE (SUBREG_REG (opleft)),
-                                     &inner_mode)
-          && GET_CODE (SUBREG_REG (opleft)) == ASHIFT
-          && GET_CODE (opright) == LSHIFTRT
-          && GET_CODE (XEXP (opright, 0)) == SUBREG
-          && known_eq (SUBREG_BYTE (opleft), SUBREG_BYTE (XEXP (opright, 0)))
-          && GET_MODE_SIZE (int_mode) < GET_MODE_SIZE (inner_mode)
-          && rtx_equal_p (XEXP (SUBREG_REG (opleft), 0),
-                          SUBREG_REG (XEXP (opright, 0)))
-          && CONST_INT_P (XEXP (SUBREG_REG (opleft), 1))
-          && CONST_INT_P (XEXP (opright, 1))
-          && (INTVAL (XEXP (SUBREG_REG (opleft), 1))
-              + INTVAL (XEXP (opright, 1))
-              == GET_MODE_PRECISION (int_mode)))
-        return gen_rtx_ROTATE (int_mode, XEXP (opright, 0),
-                               XEXP (SUBREG_REG (opleft), 1));
+         mode size to (rotate A CX).  */
+      tem = simplify_rotate_op (op0, op1, mode);
+      if (tem)
+        return tem;
 
       /* If OP0 is (ashiftrt (plus ...) C), it might actually be
          a (sign_extend (plus ...)).  Then check if OP1 is a CONST_INT and
@@ -3838,6 +3898,12 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
               return tem;
             }
 
+      /* Convert (xor (ashift A CX) (lshiftrt A CY)) where CX+CY equals the
+         mode size to (rotate A CX).  */
+      tem = simplify_rotate_op (op0, op1, mode);
+      if (tem)
+        return tem;
+
       /* Convert (xor (and (not A) B) A) into A | B.  */
       if (GET_CODE (op0) == AND
           && GET_CODE (XEXP (op0, 0)) == NOT
@@ -8684,6 +8750,46 @@ test_vec_merge (machine_mode mode)
                  simplify_rtx (nvm));
 }
 
+/* Test that vector rotate formation works at RTL level.  Try various
+   combinations of (REG << C) [|,^,+] (REG >> (<bitwidth> - C)).  */
+
+static void
+test_vector_rotate (rtx reg)
+{
+  machine_mode mode = GET_MODE (reg);
+  unsigned bitwidth = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
+  rtx plus_rtx = gen_rtx_PLUS (mode, reg, reg);
+  rtx lshftrt_amnt = GEN_INT (bitwidth - 1);
+  lshftrt_amnt = gen_const_vec_duplicate (mode, lshftrt_amnt);
+  rtx lshiftrt_rtx = gen_rtx_LSHIFTRT (mode, reg, lshftrt_amnt);
+  rtx rotate_rtx = gen_rtx_ROTATE (mode, reg, CONST1_RTX (mode));
+  /* Test explicitly the case where ASHIFT (x, 1) is a PLUS (x, x).  */
+  ASSERT_RTX_EQ (rotate_rtx,
+                 simplify_rtx (gen_rtx_IOR (mode, plus_rtx, lshiftrt_rtx)));
+  ASSERT_RTX_EQ (rotate_rtx,
+                 simplify_rtx (gen_rtx_XOR (mode, plus_rtx, lshiftrt_rtx)));
+  ASSERT_RTX_EQ (rotate_rtx,
+                 simplify_rtx (gen_rtx_PLUS (mode, plus_rtx, lshiftrt_rtx)));
+
+  /* Don't go through every possible rotate amount to save execution time.
+     Multiple of BITS_PER_UNIT amounts could conceivably be simplified to
+     other bswap operations sometimes.  Go through just the odd amounts.  */
+  for (unsigned i = 3; i < bitwidth - 2; i += 2)
+    {
+      rtx rot_amnt = gen_const_vec_duplicate (mode, GEN_INT (i));
+      rtx ashift_rtx = gen_rtx_ASHIFT (mode, reg, rot_amnt);
+      lshftrt_amnt = gen_const_vec_duplicate (mode, GEN_INT (bitwidth - i));
+      lshiftrt_rtx = gen_rtx_LSHIFTRT (mode, reg, lshftrt_amnt);
+      rotate_rtx = gen_rtx_ROTATE (mode, reg, rot_amnt);
+      ASSERT_RTX_EQ (rotate_rtx,
+                     simplify_rtx (gen_rtx_IOR (mode, ashift_rtx, lshiftrt_rtx)));
+      ASSERT_RTX_EQ (rotate_rtx,
+                     simplify_rtx (gen_rtx_XOR (mode, ashift_rtx, lshiftrt_rtx)));
+      ASSERT_RTX_EQ (rotate_rtx,
+                     simplify_rtx (gen_rtx_PLUS (mode, ashift_rtx, lshiftrt_rtx)));
+    }
+}
+
 /* Test subregs of integer vector constant X, trying elements in
    the range [ELT_BIAS, ELT_BIAS + constant_lower_bound (NELTS)),
    where NELTS is the number of elements in X.  Subregs involving
@@ -8855,11 +8961,13 @@ test_vector_ops ()
       {
         rtx scalar_reg = make_test_reg (GET_MODE_INNER (mode));
         test_vector_ops_duplicate (mode, scalar_reg);
+        rtx vector_reg = make_test_reg (mode);
         if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
             && maybe_gt (GET_MODE_NUNITS (mode), 2))
           {
             test_vector_ops_series (mode, scalar_reg);
             test_vector_subregs (mode);
+            test_vector_rotate (vector_reg);
           }
         test_vec_merge (mode);
       }
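For reference, the second issue the commit message describes is that RTL often
represents a shift by one as (plus X X); extract_ashift_operands_p treats that
form as ASHIFT (X, 1).  A scalar sketch of the shape involved (illustrative only,
not taken from the patch or its testsuite; at the source level GIMPLE may already
fold this before RTL is reached):

    #include <stdint.h>

    /* x + x is x << 1, and the two arms have disjoint set bits, so this
       XOR-combined form is a rotate left by 1, mirroring the motivating
       AArch64 testcase above.  */
    uint64_t
    rot1_xor (uint64_t x)
    {
      return (x + x) ^ (x >> 63);
    }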