https://gcc.gnu.org/g:405c99c17210a58df1adddd237219e773e689f17

commit r15-6657-g405c99c17210a58df1adddd237219e773e689f17
Author: Tamar Christina <tamar.christ...@arm.com>
Date:   Mon Jan 6 17:52:14 2025 +0000

    perform affine fold to unsigned on non address expressions. [PR114932]
    
    When the patch for PR114074 was applied we saw a good boost in exchange2.
    
    This boost was partially caused by a simplification of the addressing modes.
    With the patch applied IV opts saw the following form for the base
    addressing:
    
      Base: (integer(kind=4) *) &block + ((sizetype) ((unsigned long) l0_19(D) *
    324) + 36)
    
    vs what we normally get:
    
      Base: (integer(kind=4) *) &block + ((sizetype) ((integer(kind=8)) l0_19(D)
    * 81) + 9) * 4
    
    This is because the patch promoted multiplies where one operand is a 
constant
    from a signed multiply to an unsigned one, to attempt to fold away the 
constant.
    
    This patch attempts the same but due to the various problems with SCEV and
    niters not being able to analyze the resulting forms (i.e. PR114322) we 
can't
    do it during SCEV or in the general form like in fold-const like 
extract_muldiv
    attempts.
    
    Instead this applies the simplification during IVopts initialization when we
    create the IV. This allows IV opts to see the simplified form without
    influencing the rest of the compiler.
    
    as mentioned in PR114074 it would be good to fix the missed optimization in 
the
    other passes so we can perform this in general.
    
    The reason this has a big impact on Fortran code is that Fortran doesn't
    seem to have unsigned integer types.  As such all its addressing expressions
    are created with signed types and folding does not happen on them due to the
    possible overflow.
    
    Concretely, on AArch64 this changes the generated code from:
    
            mov     x27, -108
            mov     x24, -72
            mov     x23, -36
            add     x21, x1, x0, lsl 2
            add     x19, x20, x22
    .L5:
            add     x0, x22, x19
            add     x19, x19, 324
            ldr     d1, [x0, x27]
            add     v1.2s, v1.2s, v15.2s
            str     d1, [x20, 216]
            ldr     d0, [x0, x24]
            add     v0.2s, v0.2s, v15.2s
            str     d0, [x20, 252]
            ldr     d31, [x0, x23]
            add     v31.2s, v31.2s, v15.2s
            str     d31, [x20, 288]
            bl      digits_20_
            cmp     x21, x19
            bne     .L5
    
    into:
    
    .L5:
            ldr     d1, [x19, -108]
            add     v1.2s, v1.2s, v15.2s
            str     d1, [x20, 216]
            ldr     d0, [x19, -72]
            add     v0.2s, v0.2s, v15.2s
            str     d0, [x20, 252]
            ldr     d31, [x19, -36]
            add     x19, x19, 324
            add     v31.2s, v31.2s, v15.2s
            str     d31, [x20, 288]
            bl      digits_20_
            cmp     x21, x19
            bne     .L5
    
    The two patches together result in a 10% performance increase in exchange2 in
    SPECCPU 2017 and a 4% reduction in binary size and a 5% improvement in 
compile
    time. There's also a 5% performance improvement in fotonik3d and similar
    reduction in binary size.
    
    The patch folds every IV to unsigned to canonicalize them.  At the end of
    the pass match.pd will then remove unneeded conversions.
    
    Note that we cannot force everything to unsigned; IVopts requires that array
    address expressions remain as such.  Folding them results in them becoming
    pointer expressions for which some optimizations in IVopts do not run.
    
    gcc/ChangeLog:
    
            PR tree-optimization/114932
            * tree-ssa-loop-ivopts.cc (alloc_iv): Perform affine unsigned fold.
    
    gcc/testsuite/ChangeLog:
    
            PR tree-optimization/114932
            * gcc.dg/tree-ssa/pr64705.c: Update dump file scan.
            * gcc.target/i386/pr115462.c: The testcase shares 3 IVs which
            calculate
            the same thing but with a slightly different increment offset.  The 
test
            checks for 3 complex addressing loads, one for each IV.  But with 
this
            change they now all share one IV.  That is the loop now only has one
            complex addressing.  This is ultimately driven by the backend 
costing
            and the current costing says this is preferred so updating the 
testcase.
            * gfortran.dg/addressing-modes_1.f90: New test.

Diff:
---
 gcc/testsuite/gcc.dg/tree-ssa/pr64705.c          |  2 +-
 gcc/testsuite/gcc.target/i386/pr115462.c         |  2 +-
 gcc/testsuite/gfortran.dg/addressing-modes_1.f90 | 37 ++++++++++++++++++++++++
 gcc/tree-ssa-loop-ivopts.cc                      | 20 ++++++-------
 4 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr64705.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr64705.c
index fd24e38a53e9..3c9c2e5deed1 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr64705.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr64705.c
@@ -23,4 +23,4 @@ int foo(char *flags, long len, long i, long steps)
 
 /* Don't expand iv {base+step, step}_loop into {base+x+y, step}_loop
    even if "step == x + y".  */
-/* { dg-final { scan-tree-dump "Base:\\tstep_\[0-9\]* \\+ 
iter|Base:\\titer_\[0-9\]* \\+ step" "ivopts"} } */
+/* { dg-final { scan-tree-dump {Base:\t\(unsigned ?.*\) step_[0-9]* \+ 
\(unsigned ?.*\) iter|Base:\t\(unsigned ?.*\) iter_[0-9]* \+ \(unsigned ?.*\) 
step} "ivopts"} } */
diff --git a/gcc/testsuite/gcc.target/i386/pr115462.c 
b/gcc/testsuite/gcc.target/i386/pr115462.c
index ad50a6382bc4..6526f99bf0e3 100644
--- a/gcc/testsuite/gcc.target/i386/pr115462.c
+++ b/gcc/testsuite/gcc.target/i386/pr115462.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx2 -fno-tree-vectorize -fno-pic" } */
-/* { dg-final { scan-assembler-times {(?n)movl[ \t]+.*, p1\.0\+[0-9]*\(,} 3 } 
} */
+/* { dg-final { scan-assembler-times {(?n)movl[ \t]+.*, p1\.0\+[0-9]*\(,} 1 } 
} */
 
 int
 foo (long indx, long indx2, long indx3, long indx4, long indx5, long indx6, 
long n, int* q)
diff --git a/gcc/testsuite/gfortran.dg/addressing-modes_1.f90 
b/gcc/testsuite/gfortran.dg/addressing-modes_1.f90
new file mode 100644
index 000000000000..334d5bc47a16
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/addressing-modes_1.f90
@@ -0,0 +1,37 @@
+! { dg-do compile { target aarch64-*-* } }
+! { dg-additional-options "-w -Ofast" }
+
+  module brute_force
+    integer, parameter :: r=9
+     integer  block(r, r, 0)
+    contains
+  subroutine brute
+     do
+      do
+          do
+           do
+                do
+                     do
+                         do i7 = l0, 1
+                       select case(1 )
+                       case(1)
+                           block(:2, 7:, 1) = block(:2, 7:, i7) - 1
+                       end select
+                            do i8 = 1, 1
+                               do i9 = 1, 1
+                            if(1 == 1) then
+                                    call digits_20
+                                end if
+                                end do
+                          end do
+                    end do
+                    end do
+              end do
+              end do
+           end do
+     end do
+  end do
+ end
+  end
+
+! { dg-final { scan-assembler-not {ldr\s+d([0-9]+),\s+\[x[0-9]+, x[0-9]+\]} } }
diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc
index df1b6b871e39..989321137df9 100644
--- a/gcc/tree-ssa-loop-ivopts.cc
+++ b/gcc/tree-ssa-loop-ivopts.cc
@@ -1189,19 +1189,19 @@ alloc_iv (struct ivopts_data *data, tree base, tree 
step,
                                              sizeof (struct iv));
   gcc_assert (step != NULL_TREE);
 
-  /* Lower address expression in base except ones with DECL_P as operand.
-     By doing this:
+  /* Canonicalize the address expression in base if it were an unsigned
+      computation. That leads to more equalities being detected and results in:
+
        1) More accurate cost can be computed for address expressions;
        2) Duplicate candidates won't be created for bases in different
-         forms, like &a[0] and &a.  */
+         forms, like &a[0] and &a.
+       3) Duplicate candidates won't be created for IV expressions that differ
+         only in their sign.  */
+  aff_tree comb;
   STRIP_NOPS (expr);
-  if ((TREE_CODE (expr) == ADDR_EXPR && !DECL_P (TREE_OPERAND (expr, 0)))
-      || contain_complex_addr_expr (expr))
-    {
-      aff_tree comb;
-      tree_to_aff_combination (expr, TREE_TYPE (expr), &comb);
-      base = fold_convert (TREE_TYPE (base), aff_combination_to_tree (&comb));
-    }
+  expr = fold_convert (unsigned_type_for (TREE_TYPE (expr)), expr);
+  tree_to_aff_combination (expr, TREE_TYPE (expr), &comb);
+  base = fold_convert (TREE_TYPE (base), aff_combination_to_tree (&comb));
 
   iv->base = base;
   iv->base_object = determine_base_object (data, base);

Reply via email to