From: Avinash Jayakar <[email protected]>

Hi,

Here is a small patch to improve code generation on powerpc for the %,
[fl]% and [cl]% operators.
Kindly review. Is this OK for trunk?

Changes from v1:
- Added test cases to check the vectorization.

Thanks and regards,
Avinash Jayakar

The modulo operator and its floor/ceil variants (%, [fl]% and [cl]%) are
not auto-vectorized on powerpc, because during vectorization the vector
loop is unfairly judged costlier than the scalar loop.  That is, the
scalar cost of the % operator is taken to be 1 even though it expands to
6 instructions in assembly, so at -O2 the vectorizer considers the
scalar loop cheaper and abandons vectorization.

This patch adjusts the cost of these three operators to match the number
of instructions seen in the assembly, and thus generates faster-running
code when these operators appear in a loop.

Suppose the source is

for (int i=0; i<N; i++)
  a[i] = a[i] % CONST;

The inner basic block of the loop would emit the following asm:
.L2:
        lwzu 10,4(4)
        mulhw 8,10,5
        srawi 9,10,31
        srawi 8,8,3
        subf 9,9,8
        mulli 9,9,19
        subf 9,9,10
        stwu 9,4(3)
        bdnz .L2
        blr

After fine-tuning the cost we see:
.L2:
        lxvd2x 45,0,3
        vmulosw 10,13,11
        vmulesw 0,13,11
        vsraw 1,13,9
        vmrgew 0,0,10
        vsraw 0,0,8
        vsubuwm 0,0,1
        vslw 1,0,12
        vadduwm 1,1,0
        vslw 1,1,12
        vsubuwm 0,1,0
        vsubuwm 13,13,0
        stxvd2x 45,0,3
        addi 3,3,16
        bdnz .L2
        blr

Although the code size increases, the runtime performance is almost 4x
better than that of the scalar code.

2025-11-28  Avinash Jayakar  <[email protected]>

        PR 121700

gcc/ChangeLog:

        * config/rs6000/rs6000.cc (rs6000_adjust_vect_cost_per_stmt):
        Fine-grain adjustment of %, [fl]% and [cl]% ops.

gcc/testsuite/ChangeLog:

        * gcc.target/powerpc/pr121700-ceil-mod.c: New test.
        * gcc.target/powerpc/pr121700-floor-mod.c: New test.
        * gcc.target/powerpc/pr121700-trunc-mod.c: New test.
        * gcc.target/powerpc/pr121700.h: Test utility.
---
 gcc/config/rs6000/rs6000.cc                   |  7 +++
 .../gcc.target/powerpc/pr121700-ceil-mod.c    | 16 ++++++
 .../gcc.target/powerpc/pr121700-floor-mod.c   | 16 ++++++
 .../gcc.target/powerpc/pr121700-trunc-mod.c   | 16 ++++++
 gcc/testsuite/gcc.target/powerpc/pr121700.h   | 52 +++++++++++++++++++
 5 files changed, 107 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr121700-ceil-mod.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr121700-floor-mod.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr121700-trunc-mod.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr121700.h

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 1d5cd25c0f0..181571f17ab 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -5308,6 +5308,13 @@ rs6000_adjust_vect_cost_per_stmt (enum 
vect_cost_for_stmt kind,
       tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
       if (subcode == COND_EXPR)
        return 2;
+      /* For {TRUNC,FLOOR,CEIL}_MOD_EXPR, cost them a bit higher in order to
+         fairly compare the scalar and vector costs, since no single scalar
+         instruction can evaluate these expressions.  Currently using the
+         number of instructions generated for these expressions.  */
+      else if (subcode == FLOOR_MOD_EXPR || subcode == TRUNC_MOD_EXPR
+              || subcode == CEIL_MOD_EXPR)
+       return 6;
     }
 
   return 0;
diff --git a/gcc/testsuite/gcc.target/powerpc/pr121700-ceil-mod.c 
b/gcc/testsuite/gcc.target/powerpc/pr121700-ceil-mod.c
new file mode 100644
index 00000000000..56ac1a48217
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr121700-ceil-mod.c
@@ -0,0 +1,16 @@
+/* { dg-options "-fgimple -fdump-tree-vect-details -O2 -mdejagnu-cpu=power8" } 
*/
+
+#include "pr121700.h"
+
+TEST_FN(__CEIL_MOD, 19, mod)
+
+int main (void)
+{
+  int *a = (int*)&arr;
+  init_arr(a, N);
+  mod(a);
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1 "vect" } } 
*/
diff --git a/gcc/testsuite/gcc.target/powerpc/pr121700-floor-mod.c 
b/gcc/testsuite/gcc.target/powerpc/pr121700-floor-mod.c
new file mode 100644
index 00000000000..9198773b210
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr121700-floor-mod.c
@@ -0,0 +1,16 @@
+/* { dg-options "-fgimple -fdump-tree-vect-details -O2 -mdejagnu-cpu=power8" } 
*/
+
+#include "pr121700.h"
+
+TEST_FN(__FLOOR_MOD, 19, mod)
+
+int main (void)
+{
+  int *a = (int*)&arr;
+  init_arr(a, N);
+  mod(a);
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1 "vect" } } 
*/
diff --git a/gcc/testsuite/gcc.target/powerpc/pr121700-trunc-mod.c 
b/gcc/testsuite/gcc.target/powerpc/pr121700-trunc-mod.c
new file mode 100644
index 00000000000..c1154b08a39
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr121700-trunc-mod.c
@@ -0,0 +1,16 @@
+/* { dg-options "-fgimple -fdump-tree-vect-details -O2 -mdejagnu-cpu=power8" } 
*/
+
+#include "pr121700.h"
+
+TEST_FN(%, 19, mod)
+
+int main (void)
+{
+  int *a = (int*)&arr;
+  init_arr(a, N);
+  mod(a);
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1 "vect" } } 
*/
diff --git a/gcc/testsuite/gcc.target/powerpc/pr121700.h 
b/gcc/testsuite/gcc.target/powerpc/pr121700.h
new file mode 100644
index 00000000000..1550f9f8f5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr121700.h
@@ -0,0 +1,52 @@
+#define TEST_FN(OP, CONST, NAME) \
+__attribute__((noinline)) \
+void __GIMPLE (ssa,guessed_local(10737416)) \
+NAME (int * a) \
+{ \
+  int i; \
+  long unsigned int _1; \
+  long unsigned int _2; \
+  int * _3; \
+  int _4; \
+  int _5; \
+  unsigned int _12; \
+  unsigned int _13; \
+ \
+  __BB(2,guessed_local(10737416)): \
+  goto __BB3(precise(134217728)); \
+ \
+  __BB(3,loop_header(1),guessed_local(1063004408)): \
+  i_14 = __PHI (__BB5: i_11, __BB2: 0); \
+  _13 = __PHI (__BB5: _12, __BB2: 1024u); \
+  _1 = (long unsigned int) i_14; \
+  _2 = _1 * 4ul; \
+  _3 = a_9(D) + _2; \
+  _4 = __MEM <int> (_3); \
+  _5 = _4 OP CONST; \
+  __MEM <int> (_3) = _5; \
+  i_11 = i_14 + 1; \
+  _12 = _13 - 1u; \
+  if (_12 != 0u) \
+    goto __BB5(guessed(132861994)); \
+  else \
+    goto __BB4(guessed(1355734)); \
+ \
+  __BB(5,guessed_local(1052266995)): \
+  goto __BB3(precise(134217728)); \
+ \
+  __BB(4,guessed_local(10737416)): \
+  return; \
+ \
+} \
+
+
+
+#define N 1024
+int arr[N];
+void init_arr (int *a, int n)
+{
+  #pragma GCC novector
+  for (int i=0; i<n; i++)
+    a[i] = i - n/2;
+}
+
-- 
2.51.0

Reply via email to