On Tue, 2019-02-05 at 21:12 +0000, Wilco Dijkstra wrote:

> +bool
> +aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
> +                                  unsigned HOST_WIDE_INT mask1,
> +                                  unsigned HOST_WIDE_INT shft_amnt,
> +                                  unsigned HOST_WIDE_INT mask2)
> +{
> +  unsigned HOST_WIDE_INT t;
> +
> +  /* Verify that there is no overlap in what bits are set in the two
> +     masks.  */
> +  if ((mask1 + mask2) != HOST_WIDE_INT_M1U)
> +    return false;
> +  if ((mask1 & mask2) != 0)
> +    return false;
> 
> Why not check mask1 == ~mask2? That's much simpler...

Yes, that would be simpler.  I made that change.
> 
> +  /* Verify that the shift amount is less than the mode size.  */
> +  if (shft_amnt >= GET_MODE_BITSIZE (mode))
> +    return false;
> 
> The md pattern already guarantees this (this could be an assert).  We need
> to check that shift + width <= mode size.  Given immediates are limited to
> the mode size, the easiest way is to check mask2 is not 0 or M1.

OK, I changed the if statement to a gcc_assert and added a check to
make sure mask2 is not 0 or M1.
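
(Rejecting those two values up front also matters for the output
template: as I understand it, the BFI width comes from the popcount of
mask2, so mask2 == 0 or M1 would give a zero-width or full-register
insert, which is not a BFI we want to emit.)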

> +  /* Verify that the mask being shifted is contiguous and would be in the
> +     least significant bits after shifting by creating a mask 't' based on
> +     the number of bits set in mask2 and the shift amount for mask2 and
> +     comparing that to the actual mask2.  */
> +  t = popcount_hwi (mask2);
> +  t = (HOST_WIDE_INT_1U << t) - 1;
> +  t = t << shft_amnt;
> +  if (t != mask2)
> +    return false;
> +
> +  return true;
> 
> This would return true if mask2 == 0 or M1 (I think this can't happen with
> current md patterns, but this would emit an illegal bfi).
> 
> After special cases you could do something like t = mask2 + (HWI_1U << shift);
> return t == (t & -t) to check for a valid bfi.

I am not sure I follow this logic, and my attempts to use it did not
work, so I kept my original code.
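
For the archives, here is my best reading of that suggestion as a
standalone sketch (untested, and I may still be misreading it):

  /* Once mask2 == 0 and mask2 == M1 have been rejected, a contiguous
     mask2 starting at bit SHFT_AMNT has the form
     ((1 << width) - 1) << shft_amnt, so adding (1 << shft_amnt) carries
     through the run of ones and leaves at most one bit set.  The test
     t == (t & -t) is true exactly when t has at most one set bit.  */
  unsigned HOST_WIDE_INT t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
  return t == (t & -t);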

> +  "bfi\t%<GPI:w>0, %<GPI:w>1, 0, %P2"
> 
> This could emit a width that may be 32 too large in SImode if bit 31 is set
> (there is another use of %P in aarch64.md which may have the same
> issue).

I am not sure why having bit 31 set would be a problem.  Sign
extension?
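
Thinking about it more: I assume the problem is that SImode CONST_INTs
are stored sign-extended in the 64-bit HOST_WIDE_INT, so a mask with
bit 31 set, say 0xf0000000, is represented as 0xfffffffff0000000, and a
popcount-based width would come out 32 too large (36 instead of 4).
If that is the issue, masking the constant down to the mode first,
something like (untested)

  popcount_hwi (UINTVAL (operands[2]) & GET_MODE_MASK (<MODE>mode))

would give the right width.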

> Finally from some quick testing it seems one case is not yet
> supported:
> 
> int f1(int x, int y)
> {
>   return (y & 0xfffffff) | (((x <<28) & 0xf0000000));
> }
> 
> This just has a shift, rather than an AND plus a shift.

I added another instruction to handle this and added a new test to
check for it.
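
With the new *aarch64_bfi<GPI:mode>4_noand pattern, f1 above should
collapse to a single bit insert, something like (modulo register
allocation and a possible extra move):

  bfi     w1, w0, 28, 4

with the width, 4, computed as the mode size minus the shift amount.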

Steve Ellcey
sell...@marvell.com


Here is the latest version of the patch.


2019-02-06  Steve Ellcey  <sell...@marvell.com>

        PR rtl-optimization/87763
        * config/aarch64/aarch64-protos.h (aarch64_masks_and_shift_for_bfi_p):
        New prototype.
        * config/aarch64/aarch64.c (aarch64_masks_and_shift_for_bfi_p):
        New function.
        * config/aarch64/aarch64.md (*aarch64_bfi<GPI:mode>5_shift):
        New instruction.
        (*aarch64_bfi<GPI:mode>4_noand): Ditto.
        (*aarch64_bfi<GPI:mode>4_noshift): Ditto.
        (*aarch64_bfi<GPI:mode>4_noshift_alt): Ditto.

2019-02-06  Steve Ellcey  <sell...@marvell.com>

        PR rtl-optimization/87763
        * gcc.target/aarch64/combine_bfxil.c: Change some bfxil checks
        to bfi.
        * gcc.target/aarch64/combine_bfi_2.c: New test.

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index b035e35f33b..b6c0d0a8eb6 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -429,6 +429,9 @@ bool aarch64_label_mentioned_p (rtx);
 void aarch64_declare_function_name (FILE *, const char*, tree);
 bool aarch64_legitimate_pic_operand_p (rtx);
 bool aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode, rtx, rtx);
+bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, unsigned HOST_WIDE_INT,
+					unsigned HOST_WIDE_INT,
+					unsigned HOST_WIDE_INT);
 bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
 bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d7c453cdad0..26b5ab47d6f 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9330,6 +9330,42 @@ aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
 	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
 }
 
+/* Return true if the masks and a shift amount from an RTX of the form
+   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
+   a BFI instruction of mode MODE.  See *aarch64_bfi patterns.  */
+
+bool
+aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
+				   unsigned HOST_WIDE_INT mask1,
+				   unsigned HOST_WIDE_INT shft_amnt,
+				   unsigned HOST_WIDE_INT mask2)
+{
+  unsigned HOST_WIDE_INT t;
+
+  /* Verify that the two masks are exact complements of each other.  */
+  if (mask1 != ~mask2)
+    return false;
+
+  /* Verify that mask2 is not all zeros or all ones.  */
+  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
+    return false;
+
+  /* The shift amount should always be less than the mode size.  */
+  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
+
+  /* Verify that mask2 is a contiguous run of set bits starting exactly
+     at the shift amount: build a mask 't' with popcount (mask2)
+     low-order bits set, shift it left by shft_amnt, and compare the
+     result to the actual mask2.  */
+  t = popcount_hwi (mask2);
+  t = (HOST_WIDE_INT_1U << t) - 1;
+  t = t << shft_amnt;
+  if (t != mask2)
+    return false;
+
+  return true;
+}
+
 /* Calculate the cost of calculating X, storing it in *COST.  Result
    is true if the total cost of the operation has now been calculated.  */
 static bool
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b7f6fe0f135..2bbd3f1055c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5476,6 +5476,76 @@
   [(set_attr "type" "bfm")]
 )
 
+;;  Match a bfi instruction where the shift of OP3 means that we are
+;;  actually copying the least significant bits of OP3 into OP0 by way
+;;  of the AND masks and the IOR instruction.  A similar instruction
+;;  with the two halves of the IOR swapped around never triggered in a
+;;  bootstrap build and test of GCC, so it was not included.
+
+(define_insn "*aarch64_bfi<GPI:mode>5_shift"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (and:GPI (ashift:GPI
+                           (match_operand:GPI 3 "register_operand" "r")
+                           (match_operand:GPI 4 "aarch64_simd_shift_imm_<mode>" "n"))
+                          (match_operand:GPI 5 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[2]),
+				      UINTVAL (operands[4]),
+				      UINTVAL (operands[5]))"
+  "bfi\t%<GPI:w>0, %<GPI:w>3, %4, %P5"
+  [(set_attr "type" "bfm")]
+)
+
+;; Like *aarch64_bfi<GPI:mode>5_shift but without the AND on the ASHIFT;
+;; the shift already clears the low bits, so the mask would be redundant.
+
+(define_insn "*aarch64_bfi<GPI:mode>4_noand"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (ashift:GPI
+                          (match_operand:GPI 3 "register_operand" "r")
+                          (match_operand:GPI 4 "aarch64_simd_shift_imm_<mode>" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[2]),
+				      UINTVAL (operands[4]),
+				      HOST_WIDE_INT_M1U << UINTVAL (operands[4]))"
+{
+  operands[5] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode) - UINTVAL (operands[4]));
+  return "bfi\t%<GPI:w>0, %<GPI:w>3, %4, %5";
+}
+  [(set_attr "type" "bfm")]
+)
+
+;; Like *aarch64_bfi<GPI:mode>5_shift but with no shifting; we are just
+;; copying the least significant bits of OP3 to OP0.  In this case we do
+;; need two versions of the instruction to handle different checks on the
+;; constant values.
+
+(define_insn "*aarch64_bfi<GPI:mode>4_noshift"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (and:GPI (match_operand:GPI 3 "register_operand" "r")
+                          (match_operand:GPI 4 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[2]), 0,
+				      UINTVAL (operands[4]))"
+  "bfi\t%<GPI:w>0, %<GPI:w>3, 0, %P4"
+  [(set_attr "type" "bfm")]
+)
+
+(define_insn "*aarch64_bfi<GPI:mode>4_noshift_alt"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (and:GPI (match_operand:GPI 3 "register_operand" "0")
+                          (match_operand:GPI 4 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[4]), 0,
+				      UINTVAL (operands[2]))"
+  "bfi\t%<GPI:w>0, %<GPI:w>1, 0, %P2"
+  [(set_attr "type" "bfm")]
+)
+
 (define_insn "*extr_insv_lower_reg<mode>"
   [(set (zero_extract:GPI (match_operand:GPI 0 "register_operand" "+r")
 			  (match_operand 1 "const_int_operand" "n")
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfi_2.c b/gcc/testsuite/gcc.target/aarch64/combine_bfi_2.c
index e69de29bb2d..145282d4d55 100644
--- a/gcc/testsuite/gcc.target/aarch64/combine_bfi_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfi_2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+int f1(int x, int y)
+{
+  return (y & 0xfffffff) | ((x << 28) & 0xf0000000);
+}
+
+
+int f2(int x, int y)
+{
+  return ((x << 28) & 0xf0000000) | (y & 0xfffffff);
+}
+
+/* { dg-final { scan-assembler-times {\tbfi\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
index 109f989a2f0..c3b5fc58800 100644
--- a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
@@ -114,4 +114,5 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-assembler-times "bfxil\\t" 18 } } */
+/* { dg-final { scan-assembler-times "bfxil\\t" 3 } } */
+/* { dg-final { scan-assembler-times "bfi\\t" 15 } } */
