From: Andrew Pinski <apin...@marvell.com>

This patch adds simple folding of __builtin_aarch64_im_lane_boundsi for
the cases where no error is going to be emitted, i.e. when all three
arguments are integer constants and the lane is in range.  Folding
removes the call from the IR, which fixes PR target/95969.

OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.

gcc/ChangeLog:

        PR target/95969
        * config/aarch64/aarch64-builtins.c (aarch64_fold_builtin_lane_check):
        New function.
        (aarch64_general_fold_builtin): Handle AARCH64_SIMD_BUILTIN_LANE_CHECK.
        (aarch64_general_gimple_fold_builtin): Likewise.

gcc/testsuite/ChangeLog:

        PR target/95969
        * gcc.target/aarch64/lane-bound-1.c: New test.
        * gcc.target/aarch64/lane-bound-2.c: New test.
---
 gcc/config/aarch64/aarch64-builtins.c         | 35 +++++++++++++++++++
 .../gcc.target/aarch64/lane-bound-1.c         | 14 ++++++++
 .../gcc.target/aarch64/lane-bound-2.c         | 10 ++++++
 3 files changed, 59 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/lane-bound-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/lane-bound-2.c

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index eef9fc0f444..119f67d4e4c 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -29,6 +29,7 @@
 #include "rtl.h"
 #include "tree.h"
 #include "gimple.h"
+#include "ssa.h"
 #include "memmodel.h"
 #include "tm_p.h"
 #include "expmed.h"
@@ -2333,6 +2334,27 @@ aarch64_general_builtin_rsqrt (unsigned int fn)
   return NULL_TREE;
 }
 
+/* Return true if the lane check (ARG0 = total size, ARG1 = element size,
+   ARG2 = lane) can be removed as there is no error going to be emitted.  */
+static bool
+aarch64_fold_builtin_lane_check (tree arg0, tree arg1, tree arg2)
+{
+  if (TREE_CODE (arg0) != INTEGER_CST)
+    return false;
+  if (TREE_CODE (arg1) != INTEGER_CST)
+    return false;
+  if (TREE_CODE (arg2) != INTEGER_CST)
+    return false;
+
+  auto totalsize = wi::to_widest (arg0);
+  auto elementsize = wi::to_widest (arg1);
+  if (totalsize == 0 || elementsize == 0)  /* Also avoids dividing by 0.  */
+    return false;
+  auto lane = wi::to_widest (arg2);
+  auto high = wi::udiv_trunc (totalsize, elementsize);  /* Lane count.  */
+  return wi::ltu_p (lane, high);
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
@@ -2353,6 +2375,11 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
       VAR1 (UNOP, floatv4si, 2, ALL, v4sf)
       VAR1 (UNOP, floatv2di, 2, ALL, v2df)
        return fold_build1 (FLOAT_EXPR, type, args[0]);
+      case AARCH64_SIMD_BUILTIN_LANE_CHECK:
+       gcc_assert (n_args == 3);
+       if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
+         return void_node;
+       break;
       default:
        break;
     }
@@ -2440,6 +2467,14 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt)
            }
          break;
        }
+    case AARCH64_SIMD_BUILTIN_LANE_CHECK:
+      if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
+       {
+         unlink_stmt_vdef (stmt);
+         release_defs (stmt);
+         new_stmt = gimple_build_nop ();
+       }
+      break;
     default:
       break;
     }
diff --git a/gcc/testsuite/gcc.target/aarch64/lane-bound-1.c b/gcc/testsuite/gcc.target/aarch64/lane-bound-1.c
new file mode 100644
index 00000000000..bbbe679fd80
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/lane-bound-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+#include <arm_neon.h>
+
+void
+f (float32x4_t **ptr)
+{
+  float32x4_t res = vsetq_lane_f32 (0.0f, **ptr, 0);
+  **ptr = res;  /* Expected to reuse the earlier load of *ptr.  */
+}
+/* GCC should be able to remove the call to "__builtin_aarch64_im_lane_boundsi"
+   and optimize out the second load from *ptr.  */
+/* { dg-final { scan-tree-dump-times "__builtin_aarch64_im_lane_boundsi" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " = \\\*ptr_" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/lane-bound-2.c b/gcc/testsuite/gcc.target/aarch64/lane-bound-2.c
new file mode 100644
index 00000000000..923c94687c6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/lane-bound-2.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-original" } */
+void
+f (void)
+{
+  __builtin_aarch64_im_lane_boundsi (16, 4, 0);  /* 16 / 4 = 4 lanes.  */
+  __builtin_aarch64_im_lane_boundsi (8, 8, 0);  /* 8 / 8 = 1 lane.  */
+}
+/* GCC should be able to optimize these out before gimplification. */
+/* { dg-final { scan-tree-dump-times "__builtin_aarch64_im_lane_boundsi" 0 "original" } } */
-- 
2.17.1

Reply via email to