Given a sequence such as int foo () { #pragma GCC unroll 4 for (int i = 0; i < N; i++) if (a[i] == 124) return 1;
return 0; } where a[i] is long long, we will unroll the loop and use an OR reduction for early break on Adv. SIMD. Afterwards the sequence is followed by a compression sequence to compress the 128-bit vectors into 64 bits for use by the branch. However, if we have support for add halving and narrowing then we can, instead of using an OR, use an ADDHN which will do the combining and narrowing. Note that for now I only do the last OR, however if we have more than one level of unrolling we could technically chain them. I will revisit this in another upcoming early break series, however an unroll of 2 is fairly common. Bootstrapped and regtested on aarch64-none-linux-gnu, arm-none-linux-gnueabihf, x86_64-pc-linux-gnu -m32, -m64 and no issues and about a 10% improvement in this sequence for Adv. SIMD. Ok for master? Thanks, Tamar gcc/ChangeLog: * tree-vect-stmts.cc (vectorizable_early_exit): Use addhn if supported. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vect-early-break-addhn_1.c: New test. * gcc.target/aarch64/vect-early-break-addhn_2.c: New test. * gcc.target/aarch64/vect-early-break-addhn_3.c: New test. * gcc.target/aarch64/vect-early-break-addhn_4.c: New test. --- diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c new file mode 100644 index 0000000000000000000000000000000000000000..0fce36f277f389d5f43174e398b8800ab11b31da --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE int +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... +** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s +** fmov x[0-9]+, d[0-9]+ +** ... 
+*/ + +int foo () +{ +#pragma GCC unroll 8 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c new file mode 100644 index 0000000000000000000000000000000000000000..9c781620749c1bd4ea6b0290d862f8ff5c84e6db --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE long long +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... +** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d +** fmov x[0-9]+, d[0-9]+ +** ... +*/ + +int foo () +{ +#pragma GCC unroll 4 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c new file mode 100644 index 0000000000000000000000000000000000000000..0cebe9bdf4a1b8ba576f9c04fc7d2b8d79b97a9e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE short +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... 
+** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h +** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h +** fmov x[0-9]+, d[0-9]+ +** ... +*/ + +int foo () +{ +#pragma GCC unroll 16 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c new file mode 100644 index 0000000000000000000000000000000000000000..9e35329cb271d38eff845c49df406f8501870b36 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ + +#define TYPE char +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +int foo () +{ +#pragma GCC unroll 32 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump-not "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 97b3d4801d19f3168b91c91271e882bad3f99f13..a1ecce8ea227654907c59828ff34c177cf680061 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -12292,7 +12292,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info, gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info)); gcond *cond_stmt = as_a <gcond *>(orig_stmt); - tree cst = build_zero_cst (vectype); + tree vectype_out = vectype; auto bb = gimple_bb (cond_stmt); edge exit_true_edge = EDGE_SUCC (bb, 0); if (exit_true_edge->flags & EDGE_FALSE_VALUE) @@ -12416,12 +12416,40 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info, else workset.splice (stmts); + /* See if we support ADDHN and use that for the reduction. 
*/ + internal_fn ifn = IFN_VEC_ADD_HALFING_NARROW_LO; + bool addhn_supported_p + = direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED); + tree narrow_type = NULL_TREE; + if (addhn_supported_p) + { + /* Calculate the narrowing type for the result. */ + auto halfprec = TYPE_PRECISION (TREE_TYPE (vectype)) / 2; + auto unsignedp = TYPE_UNSIGNED (TREE_TYPE (vectype)); + tree itype = build_nonstandard_integer_type (halfprec, unsignedp); + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); + tree tmp_type = build_vector_type (itype, nunits); + narrow_type = truth_type_for (tmp_type); + } + while (workset.length () > 1) { - new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc"); tree arg0 = workset.pop (); tree arg1 = workset.pop (); - new_stmt = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1); + if (addhn_supported_p && workset.length () == 0) + { + new_stmt = gimple_build_call_internal (ifn, 2, arg0, arg1); + vectype_out = narrow_type; + new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc"); + gimple_call_set_lhs (as_a <gcall *> (new_stmt), new_temp); + gimple_call_set_nothrow (as_a <gcall *> (new_stmt), true); + } + else + { + new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc"); + new_stmt + = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1); + } vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, &cond_gsi); workset.quick_insert (0, new_temp); @@ -12444,6 +12472,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info, gcc_assert (new_temp); + tree cst = build_zero_cst (vectype_out); gimple_cond_set_condition (cond_stmt, NE_EXPR, new_temp, cst); update_stmt (orig_stmt); --
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c new file mode 100644 index 0000000000000000000000000000000000000000..0fce36f277f389d5f43174e398b8800ab11b31da --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE int +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... +** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s +** fmov x[0-9]+, d[0-9]+ +** ... +*/ + +int foo () +{ +#pragma GCC unroll 8 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c new file mode 100644 index 0000000000000000000000000000000000000000..9c781620749c1bd4ea6b0290d862f8ff5c84e6db --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE long long +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... +** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d +** fmov x[0-9]+, d[0-9]+ +** ... 
+*/ + +int foo () +{ +#pragma GCC unroll 4 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c new file mode 100644 index 0000000000000000000000000000000000000000..0cebe9bdf4a1b8ba576f9c04fc7d2b8d79b97a9e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE short +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... +** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h +** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h +** fmov x[0-9]+, d[0-9]+ +** ... 
+*/ + +int foo () +{ +#pragma GCC unroll 16 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c new file mode 100644 index 0000000000000000000000000000000000000000..9e35329cb271d38eff845c49df406f8501870b36 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ + +#define TYPE char +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +int foo () +{ +#pragma GCC unroll 32 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump-not "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 97b3d4801d19f3168b91c91271e882bad3f99f13..a1ecce8ea227654907c59828ff34c177cf680061 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -12292,7 +12292,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info, gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info)); gcond *cond_stmt = as_a <gcond *>(orig_stmt); - tree cst = build_zero_cst (vectype); + tree vectype_out = vectype; auto bb = gimple_bb (cond_stmt); edge exit_true_edge = EDGE_SUCC (bb, 0); if (exit_true_edge->flags & EDGE_FALSE_VALUE) @@ -12416,12 +12416,40 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info, else workset.splice (stmts); + /* See if we support ADDHN and use that for the reduction. */ + internal_fn ifn = IFN_VEC_ADD_HALFING_NARROW_LO; + bool addhn_supported_p + = direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED); + tree narrow_type = NULL_TREE; + if (addhn_supported_p) + { + /* Calculate the narrowing type for the result. 
*/ + auto halfprec = TYPE_PRECISION (TREE_TYPE (vectype)) / 2; + auto unsignedp = TYPE_UNSIGNED (TREE_TYPE (vectype)); + tree itype = build_nonstandard_integer_type (halfprec, unsignedp); + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); + tree tmp_type = build_vector_type (itype, nunits); + narrow_type = truth_type_for (tmp_type); + } + while (workset.length () > 1) { - new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc"); tree arg0 = workset.pop (); tree arg1 = workset.pop (); - new_stmt = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1); + if (addhn_supported_p && workset.length () == 0) + { + new_stmt = gimple_build_call_internal (ifn, 2, arg0, arg1); + vectype_out = narrow_type; + new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc"); + gimple_call_set_lhs (as_a <gcall *> (new_stmt), new_temp); + gimple_call_set_nothrow (as_a <gcall *> (new_stmt), true); + } + else + { + new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc"); + new_stmt + = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1); + } vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, &cond_gsi); workset.quick_insert (0, new_temp); @@ -12444,6 +12472,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info, gcc_assert (new_temp); + tree cst = build_zero_cst (vectype_out); gimple_cond_set_condition (cond_stmt, NE_EXPR, new_temp, cst); update_stmt (orig_stmt);