Hi,

"Kewen.Lin" <li...@linux.ibm.com> writes:

> Hi,
>
> on 2023/10/25 10:00, Jiufu Guo wrote:
>> Hi,
>> 
>> Sometimes, a complicated constant is built via 3(or more)
>> instructions to build. Generally speaking, it would not be
>> as faster as loading it from the constant pool (as a few
>> discussions in PR63281).
>
> I may miss some previous discussions, but I'm curious why we
> chose ">=3" here, as https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63281#c9
> which indicates that more than 3 (>3) should be considered
> with this change.

Thanks a lot for your great patience for reading the history!
Yes, there are some discussions about "> 3" vs. "> 2".
- In theory, "ld" is one instruction.  If consider "address/toc"
  adjust, we may count it as 2 instructions. "pld" may need less
  cycles.
- As test, it seems "> 2" could get better/stable runtime result
  during testing SPEC2017.

>
>> 
>> For the concern that I raised in:
>> https://gcc.gnu.org/pipermail/gcc-patches/2022-August/599676.html
>> The micro-cases would not be the major concern. Because as
>> Segher explained in:
>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63281#c18
>> It would just be about the benchmark method.
>> 
>> As tested on spec2017, for visible performance changes, we
>> can find the runtime improvement on 500.perlbench_r about
>> ~1.8% (-O2) when support loading complicates constant from
>> constant pool. And no visible performance recession on
>> other benchmarks.
>
> The improvement on 500.perlbench_r looks to match what PR63281
> mentioned, nice!  I'm curious that which options and which kinds
> of CPUs have you tested with?  Since this is a general change,
> I'd expect we can test with P8/P9/P10 at O2/O3 (or Ofast) at
> least.

Great advice! Thanks for pointing this!
A few months ago, P8/P9/P10 are tested. While this time, I rerun
SPEC2017 on P10 for -O2 and -O3.  More test on latest code would
be better.


BR,
Jeff (Jiufu Guo)

>
> BR,
> Kewen
>
>> 
>> Boostrap & regtest pass on ppc64{,le}.
>> Is this ok for trunk?
>> 
>> BR,
>> Jeff (Jiufu Guo)
>> 
>>      PR target/63281
>> 
>> gcc/ChangeLog:
>> 
>>      * config/rs6000/rs6000.cc (rs6000_emit_set_const): Update to split
>>      complicate constant to memory.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>>      * gcc.target/powerpc/const_anchors.c: Update to test final-rtl. 
>>      * gcc.target/powerpc/parall_5insn_const.c: Update to keep original test
>>      point.
>>      * gcc.target/powerpc/pr106550.c: Likewise..
>>      * gcc.target/powerpc/pr106550_1.c: Likewise.
>>      * gcc.target/powerpc/pr87870.c: Update according to latest behavior.
>>      * gcc.target/powerpc/pr93012.c: Likewise.
>> 
>> ---
>>  gcc/config/rs6000/rs6000.cc                     | 16 ++++++++++++++++
>>  .../gcc.target/powerpc/const_anchors.c          |  5 ++---
>>  .../gcc.target/powerpc/parall_5insn_const.c     | 14 ++++++++++++--
>>  gcc/testsuite/gcc.target/powerpc/pr106550.c     | 17 +++++++++++++++--
>>  gcc/testsuite/gcc.target/powerpc/pr106550_1.c   | 15 +++++++++++++--
>>  gcc/testsuite/gcc.target/powerpc/pr87870.c      |  5 ++++-
>>  gcc/testsuite/gcc.target/powerpc/pr93012.c      |  4 +++-
>>  7 files changed, 65 insertions(+), 11 deletions(-)
>> 
>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
>> index 4690384cdbe..b9562f1ea0f 100644
>> --- a/gcc/config/rs6000/rs6000.cc
>> +++ b/gcc/config/rs6000/rs6000.cc
>> @@ -10292,6 +10292,22 @@ rs6000_emit_set_const (rtx dest, rtx source)
>>        c = sext_hwi (c, 32);
>>        emit_move_insn (lo, GEN_INT (c));
>>      }
>> +
>> +      /* If it can be stored to the constant pool and profitable.  */
>> +      else if (base_reg_operand (dest, mode)
>> +           && num_insns_constant (source, mode) > 2)
>> +    {
>> +      rtx sym = force_const_mem (mode, source);
>> +      if (TARGET_TOC && SYMBOL_REF_P (XEXP (sym, 0))
>> +          && use_toc_relative_ref (XEXP (sym, 0), mode))
>> +        {
>> +          rtx toc = create_TOC_reference (XEXP (sym, 0), copy_rtx (dest));
>> +          sym = gen_const_mem (mode, toc);
>> +          set_mem_alias_set (sym, get_TOC_alias_set ());
>> +        }
>> +
>> +      emit_insn (gen_rtx_SET (dest, sym));
>> +    }
>>        else
>>      rs6000_emit_set_long_const (dest, c);
>>        break;
>> diff --git a/gcc/testsuite/gcc.target/powerpc/const_anchors.c 
>> b/gcc/testsuite/gcc.target/powerpc/const_anchors.c
>> index 542e2674b12..188744165f2 100644
>> --- a/gcc/testsuite/gcc.target/powerpc/const_anchors.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/const_anchors.c
>> @@ -1,5 +1,5 @@
>>  /* { dg-do compile { target has_arch_ppc64 } } */
>> -/* { dg-options "-O2" } */
>> +/* { dg-options "-O2 -fdump-rtl-final" } */
>>  
>>  #define C1 0x2351847027482577ULL
>>  #define C2 0x2351847027482578ULL
>> @@ -16,5 +16,4 @@ void __attribute__ ((noinline)) foo1 (long long *a, long 
>> long b)
>>    if (b)
>>      *a++ = C2;
>>  }
>> -
>> -/* { dg-final { scan-assembler-times {\maddi\M} 2 } } */
>> +/* { dg-final { scan-rtl-dump-times {\madddi3\M} 2 "final" } } */
>> diff --git a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c 
>> b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
>> index e3a9a7264cf..df0690b90be 100644
>> --- a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
>> @@ -9,8 +9,18 @@
>>  void __attribute__ ((noinline)) foo (unsigned long long *a)
>>  {
>>    /* 2 lis + 2 ori + 1 rldimi for each constant.  */
>> -  *a++ = 0x800aabcdc167fa16ULL;
>> -  *a++ = 0x7543a876867f616ULL;
>> +  {
>> +    register long long d asm("r0") = 0x800aabcdc167fa16ULL;
>> +    long long n;
>> +    asm("mr %0, %1" : "=r"(n) : "r"(d));
>> +    *a++ = n;
>> +  }
>> +  {
>> +    register long long d asm("r0") = 0x7543a876867f616ULL;
>> +    long long n;
>> +    asm("mr %0, %1" : "=r"(n) : "r"(d));
>> +    *a++ = n;
>> +  }
>>  }
>>  
>>  long long A[] = {0x800aabcdc167fa16ULL, 0x7543a876867f616ULL};
>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106550.c 
>> b/gcc/testsuite/gcc.target/powerpc/pr106550.c
>> index 74e395331ab..5eca2b2f701 100644
>> --- a/gcc/testsuite/gcc.target/powerpc/pr106550.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr106550.c
>> @@ -1,12 +1,25 @@
>>  /* PR target/106550 */
>>  /* { dg-options "-O2 -mdejagnu-cpu=power10" } */
>>  /* { dg-require-effective-target power10_ok } */
>> +/* { dg-require-effective-target has_arch_ppc64 } */
>>  
>>  void
>>  foo (unsigned long long *a)
>>  {
>> -  *a++ = 0x020805006106003; /* pli+pli+rldimi */
>> -  *a++ = 0x2351847027482577;/* pli+pli+rldimi */  
>> +  {
>> +    /* pli+pli+rldimi */
>> +    register long long d asm("r0") = 0x020805006106003ULL;
>> +    long long n;
>> +    asm("mr %0, %1" : "=r"(n) : "r"(d));
>> +    *a++ = n;
>> +  }
>> +  {
>> +    /* pli+pli+rldimi */  
>> +    register long long d asm("r0") = 0x2351847027482577ULL;
>> +    long long n;
>> +    asm("mr %0, %1" : "=r"(n) : "r"(d));
>> +    *a++ = n;
>> +  }
>>  }
>>  
>>  /* { dg-final { scan-assembler-times {\mpli\M} 4 } } */
>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106550_1.c 
>> b/gcc/testsuite/gcc.target/powerpc/pr106550_1.c
>> index 7e709fcf9d8..11878d893a4 100644
>> --- a/gcc/testsuite/gcc.target/powerpc/pr106550_1.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr106550_1.c
>> @@ -12,8 +12,19 @@ foo (unsigned long long *a)
>>    asm("cntlzd %0, %1" : "=r"(n) : "r"(d));
>>    *a++ = n;
>>  
>> -  *a++ = 0x235a8470a7480000ULL; /* pli+sldi+oris */
>> -  *a++ = 0x23a184700000b677ULL; /* pli+sldi+ori */
>> +  {
>> +    register long long d asm("r0") = 0x235a8470a7480000ULL; /* 
>> pli+sldi+oris */
>> +    long long n;
>> +    asm("cntlzd %0, %1" : "=r"(n) : "r"(d));
>> +    *a++ = n;
>> +  }
>> +
>> +  {
>> +    register long long d asm("r0") = 0x23a184700000b677ULL; /* pli+sldi+ori 
>> */
>> +    long long n;
>> +    asm("cntlzd %0, %1" : "=r"(n) : "r"(d));
>> +    *a++ = n;
>> +  }
>>  }
>>  
>>  /* { dg-final { scan-assembler-times {\mpli\M} 3 } } */
>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr87870.c 
>> b/gcc/testsuite/gcc.target/powerpc/pr87870.c
>> index d2108ac3386..5fee06744ae 100644
>> --- a/gcc/testsuite/gcc.target/powerpc/pr87870.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr87870.c
>> @@ -25,4 +25,7 @@ test3 (void)
>>    return ((__int128)0xdeadbeefcafebabe << 64) | 0xfacefeedbaaaaaad;
>>  }
>>  
>> -/* { dg-final { scan-assembler-not {\mld\M} } } */
>> +/* test3 using "ld" to load the value for r3 and r4.
>> +   test0, test1 and test2 are using "li".  */
>> +/* { dg-final { scan-assembler-times {\mp?ld\M} 2 } } */
>> +/* { dg-final { scan-assembler-times {\mli\M} 6 } } */
>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr93012.c 
>> b/gcc/testsuite/gcc.target/powerpc/pr93012.c
>> index 4f764d0576f..87ce0b49b23 100644
>> --- a/gcc/testsuite/gcc.target/powerpc/pr93012.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr93012.c
>> @@ -10,4 +10,6 @@ unsigned long long mskh1() { return 0xffff9234ffff9234ULL; 
>> }
>>  unsigned long long mskl1() { return 0x2bcdffff2bcdffffULL; }
>>  unsigned long long mskse() { return 0xffff1234ffff1234ULL; }
>>  
>> -/* { dg-final { scan-assembler-times {\mrldimi\M} 7 } } */
>> +/* { dg-final { scan-assembler-times {\mrldimi\M} 7 { target { 
>> has_arch_pwr10 } } } } */
>> +/* { dg-final { scan-assembler-times {\mrldimi\M} 3 { target { ! 
>> has_arch_pwr10 } } } } */
>> +/* { dg-final { scan-assembler-times {\mp?ld\M} 4 { target { ! 
>> has_arch_pwr10 } } } } */

Reply via email to