From 24a478faa5acc55ae37b4bd9c8ee9a0cbaeabd30 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah <kvivekananda@nvidia.com>
Date: Tue, 14 Oct 2025 18:46:19 +1100
Subject: [PATCH v2] tree-optimization/61338 - Optimize redundant reverse
 permutations in vectorized stores

This patch eliminates redundant reverse permutations when vectorizing
reverse loops, by detecting cancelling permute pairs during store
vectorization.

In such a loop the reverse load (b[i]) emits a VEC_PERM_EXPR to reverse
the loaded vector, the element-wise operation is applied, and the
reverse store (a[i]) emits another VEC_PERM_EXPR.  The two permutes
cancel, so both can be elided.

	PR tree-optimization/61338

gcc/ChangeLog:

	* tree-vect-stmts.cc (find_slp_node_in_children): New.
	(vect_optimize_reverse_permutes): Likewise.
	(vectorizable_store): Use it to elide redundant reverse
	permutes.
	(vectorizable_load): Skip the reverse permute when
	skip_reverse_permute is set.
	* tree-vectorizer.h (_stmt_vec_info::skip_reverse_permute): New
	member.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/slp-permute-reverse-1.c: New test.
	* gcc.dg/vect/slp-permute-reverse-2.c: New runtime test.

Signed-off-by: Kugan Vivekanandarajah <kvivekananda@nvidia.com>
---
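The elision shows up in the vectorizer dump; for example:

  gcc -O3 -fdump-tree-vect-details -S slp-permute-reverse-1.c
  grep "Optimized back-to-back reverse permutes" slp-permute-reverse-1.c.*.vect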
 .../gcc.dg/vect/slp-permute-reverse-1.c       |  18 ++
 .../gcc.dg/vect/slp-permute-reverse-2.c       |  51 +++++
 gcc/tree-vect-stmts.cc                        | 136 ++++++++++++--
 gcc/tree-vectorizer.h                         |   5 +
 4 files changed, 196 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-permute-reverse-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-permute-reverse-2.c

diff --git a/gcc/testsuite/gcc.dg/vect/slp-permute-reverse-1.c b/gcc/testsuite/gcc.dg/vect/slp-permute-reverse-1.c
new file mode 100644
index 00000000000..045b09e8136
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-permute-reverse-1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-O3 -fdump-tree-vect-details" } */
+
+#define N 32000
+
+void test_reverse_loop (float * __restrict__ a,
+                        float * __restrict__ b)
+{
+  for (int i = N - 1; i >= 0; i--)
+    {
+      a[i] = b[i] + 1.0f;
+    }
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
+/* { dg-final { scan-tree-dump "Optimized back-to-back reverse permutes" "vect" } } */
+/* { dg-final { scan-assembler-not "tbl\\t" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-permute-reverse-2.c b/gcc/testsuite/gcc.dg/vect/slp-permute-reverse-2.c
new file mode 100644
index 00000000000..cf8ce512d85
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-permute-reverse-2.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-O3" } */
+
+/* Runtime test to verify correctness of redundant permute optimization.  */
+
+#define N 1024
+
+float a[N], b[N];
+
+__attribute__((noipa,noinline))
+void test_reverse (void)
+{
+  for (int i = N - 1; i >= 0; i--)
+    a[i] = b[i] + 1.0f;
+}
+
+__attribute__((noipa,noinline))
+void test_forward (void)
+{
+  for (int i = 0; i < N; i++)
+    a[i] = b[i] + 1.0f;
+}
+
+int main ()
+{
+  /* Initialize b array.  */
+  for (int i = 0; i < N; i++)
+    b[i] = (float)i;
+
+  /* Test reverse iteration.  */
+  test_reverse ();
+
+  /* Verify results.  */
+  for (int i = 0; i < N; i++)
+    if (a[i] != b[i] + 1.0f)
+      __builtin_abort ();
+
+  /* Reset for forward test.  */
+  for (int i = 0; i < N; i++)
+    a[i] = 0.0f;
+
+  test_forward ();
+
+  /* Verify results.  */
+  for (int i = 0; i < N; i++)
+    if (a[i] != b[i] + 1.0f)
+      __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index dcb25225b1b..cafc5a55dbe 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -65,6 +65,97 @@ along with GCC; see the file COPYING3.  If not see
 static tree vector_vector_composition_type (tree, poly_uint64, tree *,
 					    bool = false);
 
+/* Find the SLP node in NODE's subtree whose representative is STMT_INFO.  */
+
+static slp_tree
+find_slp_node_in_children (slp_tree node, stmt_vec_info stmt_info)
+{
+  if (!node)
+    return NULL;
+
+  if (SLP_TREE_REPRESENTATIVE (node) == stmt_info)
+    return node;
+
+  slp_tree child;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    {
+      slp_tree found = find_slp_node_in_children (child, stmt_info);
+      if (found)
+	return found;
+    }
+
+  return NULL;
+}
+
+/* Check whether the reverse permutes of a back-to-back load/store pattern
+   cancel.  Return true if the optimization applies, after marking both the
+   load and the store to skip reverse permute generation.  */
+
+static bool
+vect_optimize_reverse_permutes (vec_info *vinfo, stmt_vec_info stmt_info,
+				slp_tree slp_node)
+{
+  /* Get the RHS of the store statement; punt on non-assign stores.  */
+  if (!is_gimple_assign (stmt_info->stmt))
+    return false;
+  tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
+  if (TREE_CODE (rhs) != SSA_NAME)
+    return false;
+
+  gimple *def = SSA_NAME_DEF_STMT (rhs);
+  if (!def || !is_gimple_assign (def))
+    return false;
+
+  tree_code code = gimple_assign_rhs_code (def);
+  /* Direct copy or element-wise operations.  */
+  if (code != SSA_NAME
+      && TREE_CODE_CLASS (code) != tcc_unary
+      && TREE_CODE_CLASS (code) != tcc_binary)
+    return false;
+
+  tree op1 = gimple_assign_rhs1 (def);
+  if (TREE_CODE (op1) != SSA_NAME)
+    return false;
+
+  /* For binary operations, OP2 must be a constant (uniform vector).  */
+  if (TREE_CODE_CLASS (code) == tcc_binary)
+    {
+      tree op2 = gimple_assign_rhs2 (def);
+      if (!CONSTANT_CLASS_P (op2))
+	return false;
+    }
+  stmt_vec_info op1_info = vinfo->lookup_def (op1);
+  if (!op1_info || !STMT_VINFO_DATA_REF (op1_info)
+      || !DR_IS_READ (STMT_VINFO_DATA_REF (op1_info)))
+    return false;
+
+  /* Check if the load result is used only once (by this operation).
+     If it has multiple uses, we can't skip the permute.  */
+  if (!has_single_use (op1))
+    return false;
+
+  /* Recursively find the SLP node for the load in the store's SLP tree.  */
+  slp_tree op1_node = find_slp_node_in_children (slp_node, op1_info);
+
+  if (!op1_node)
+    return false;
+
+  /* Give up if the load already has an explicit load permutation.  */
+  if (SLP_TREE_LOAD_PERMUTATION (op1_node).exists ())
+    return false;
+  /* Both accesses are contiguous reverse; mark both statements so the
+     cancelling reverse permutes are skipped.  */
+  op1_info->skip_reverse_permute = true;
+  stmt_info->skip_reverse_permute = true;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "Optimized back-to-back reverse permutes\n");
+
+  return true;
+}
+
 /* Return TRUE iff the given statement is in an inner loop relative to
    the loop being vectorized.  */
 bool
@@ -7885,7 +7974,6 @@ vectorizable_scan_store (vec_info *vinfo, stmt_vec_info stmt_info,
   return true;
 }
 
-
 /* Function vectorizable_store.
 
    Check if STMT_INFO defines a non scalar data-ref (array/pointer/structure)
@@ -8172,7 +8260,7 @@ vectorizable_store (vec_info *vinfo,
       tree offvar = NULL_TREE;
       tree ivstep;
       tree running_off;
-      tree stride_base, stride_step, alias_off;
+      tree stride_base, stride_step = NULL_TREE, alias_off;
       tree vec_oprnd = NULL_TREE;
       tree dr_offset;
       /* Checked by get_load_store_type.  */
@@ -9073,6 +9161,12 @@ vectorizable_store (vec_info *vinfo,
 
   new_stmt = NULL;
   gcc_assert (!grouped_store);
+  /* During analysis (costing), check whether the reverse permutes of a
+     back-to-back load/store pair cancel and set skip_reverse_permute on
+     both statements; code generation below honors the flags.  */
+  if (costing_p && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+    vect_optimize_reverse_permutes (vinfo, stmt_info, slp_node);
+
   for (i = 0; i < vec_num; i++)
     {
       if (!costing_p)
@@ -9080,10 +9174,15 @@ vectorizable_store (vec_info *vinfo,
 
       if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
 	{
-	  if (costing_p)
+	  /* The reverse permute is elided when costing detected a
+	     back-to-back load/store pattern and set this flag.  */
+	  bool skip_permute = stmt_info->skip_reverse_permute;
+
+	  /* Cost or generate the reverse permute unless elided.  */
+	  if (costing_p && !skip_permute)
 	    inside_cost += record_stmt_cost (cost_vec, 1, vec_perm,
 					     slp_node, 0, vect_body);
-	  else
+	  else if (!costing_p && !skip_permute)
 	    {
 	      tree perm_mask = perm_mask_for_reverse (vectype);
 	      tree new_temp = make_ssa_name (vectype);
@@ -9449,7 +9548,7 @@ vectorizable_load (vec_info *vinfo,
   gphi *phi = NULL;
   vec<tree> dr_chain = vNULL;
   bool grouped_load = false;
-  stmt_vec_info first_stmt_info;
+  stmt_vec_info first_stmt_info = NULL;
   stmt_vec_info first_stmt_info_for_drptr = NULL;
   bool compute_in_loop = false;
   class loop *at_loop;
@@ -9859,7 +9958,7 @@ vectorizable_load (vec_info *vinfo,
       tree ivstep;
       tree running_off;
       vec<constructor_elt, va_gc> *v = NULL;
-      tree stride_base, stride_step, alias_off;
+      tree stride_base, stride_step = NULL_TREE, alias_off;
       /* Checked by get_load_store_type.  */
       unsigned int const_nunits = nunits.to_constant ();
       unsigned HOST_WIDE_INT cst_offset = 0;
@@ -11517,15 +11616,22 @@ vectorizable_load (vec_info *vinfo,
 
       if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
 	{
-	  if (costing_p)
-	    inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
-					    slp_node, 0, vect_body);
-	  else
+	  /* The reverse permute is elided when store costing detected a
+	     back-to-back load/store pattern and set this flag.  */
+	  bool skip_permute = stmt_info->skip_reverse_permute;
+
+	  if (!skip_permute)
 	    {
-	      tree perm_mask = perm_mask_for_reverse (vectype);
-	      new_temp = permute_vec_elements (vinfo, new_temp, new_temp,
-					       perm_mask, stmt_info, gsi);
-	      new_stmt = SSA_NAME_DEF_STMT (new_temp);
+	      if (costing_p)
+		inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
+						slp_node, 0, vect_body);
+	      else
+		{
+		  tree perm_mask = perm_mask_for_reverse (vectype);
+		  new_temp = permute_vec_elements (vinfo, new_temp, new_temp,
+						   perm_mask, stmt_info, gsi);
+		  new_stmt = SSA_NAME_DEF_STMT (new_temp);
+		}
 	    }
 	}
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index fdb76ad4d4d..94002304f18 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1619,6 +1619,11 @@ public:
   /* True if this is a pattern that can only be handled by SLP
      vectorization.  */
   bool slp_vect_pattern_only_p;
+
+  /* For loads/stores with VMAT_CONTIGUOUS_REVERSE, indicates that the
+     reverse permute should be skipped because it cancels with a reverse
+     permute in the paired store/load.  */
+  bool skip_reverse_permute;
 };
 
 /* Information about a gather/scatter call.  */
-- 
2.50.1 (Apple Git-155)

