On Mon, Aug 15, 2022 at 7:26 AM Jiufu Guo via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> This patch tries to put the constant into the constant pool if building
> the constant requires 3 or more instructions.
>
> But there is a concern: I'm wondering if this patch is really profitable.
>
> Because, as I tested: 1. for simple cases, if the instructions are not run
> in parallel, loading the constant from memory may be faster; but 2. if
> some instructions can run in parallel, loading the constant from memory
> is not a win compared with building the constant.  See the examples below.
>
> For f1.c and f3.c, 'loading' the constant is acceptable in terms of runtime;
> for f2.c and f4.c, 'loading' the constant is visibly slower.
>
> For real-world cases, both kinds of code sequences exist.
>
> So, I'm not sure if we need to push this patch.
>
> I ran the functions below many times (1000000000) to check the runtime.
> f1.c:
> long foo (long *arg, long*, long *)
> {
>   *arg = 0x1234567800000000;
> }
> asm building constant:
>         lis 10,0x1234
>         ori 10,10,0x5678
>         sldi 10,10,32
> vs.  asm loading
>         addis 10,2,.LC0@toc@ha
>         ld 10,.LC0@toc@l(10)
> The runtimes of 'building' and 'loading' are similar: sometimes
> 'building' is faster; sometimes 'loading' is faster. And the difference is
> slight.

I wonder if it is possible to decide this during scheduling - choose the
variant that, when the result is needed, is cheaper?  Post-RA might
be a bit difficult (I see the load from memory needs the TOC, but then
when the TOC is not available we could just always emit the build form),
and pre-reload precision might not be good enough to make this worth
the experiment?

Of course the scheduler might be lacking on the technical side as well.

>
> f2.c
> long foo (long *arg, long *arg2, long *arg3)
> {
>   *arg = 0x1234567800000000;
>   *arg2 = 0x7965234700000000;
>   *arg3 = 0x4689123700000000;
> }
> asm building constant:
>         lis 7,0x1234
>         lis 10,0x7965
>         lis 9,0x4689
>         ori 7,7,0x5678
>         ori 10,10,0x2347
>         ori 9,9,0x1237
>         sldi 7,7,32
>         sldi 10,10,32
>         sldi 9,9,32
> vs. loading
>         addis 7,2,.LC0@toc@ha
>         addis 10,2,.LC1@toc@ha
>         addis 9,2,.LC2@toc@ha
>         ld 7,.LC0@toc@l(7)
>         ld 10,.LC1@toc@l(10)
>         ld 9,.LC2@toc@l(9)
> For this case, 'loading' is always slower than 'building' (>15%).
>
> f3.c
> long foo (long *arg, long *, long *)
> {
>   *arg = 384307168202282325;
> }
>         lis 10,0x555
>         ori 10,10,0x5555
>         sldi 10,10,32
>         oris 10,10,0x5555
>         ori 10,10,0x5555
> For this case, 'building' (through 5 instructions) is slower, and 'loading'
> is ~5% faster.
>
> f4.c
> long foo (long *arg, long *arg2, long *arg3)
> {
>   *arg = 384307168202282325;
>   *arg2 = -6148914691236517205;
>   *arg3 = 768614336404564651;
> }
>         lis 7,0x555
>         lis 10,0xaaaa
>         lis 9,0xaaa
>         ori 7,7,0x5555
>         ori 10,10,0xaaaa
>         ori 9,9,0xaaaa
>         sldi 7,7,32
>         sldi 10,10,32
>         sldi 9,9,32
>         oris 7,7,0x5555
>         oris 10,10,0xaaaa
>         oris 9,9,0xaaaa
>         ori 7,7,0x5555
>         ori 10,10,0xaaab
>         ori 9,9,0xaaab
> For this case, since the 'building' instructions run in parallel, 'loading'
> is ~8% slower. On p10, 'loading' (through 'pld') is also slower, by >4%.
>
>
> BR,
> Jeff(Jiufu)
>
> ---
>  gcc/config/rs6000/rs6000.cc                | 14 ++++++++++++++
>  gcc/testsuite/gcc.target/powerpc/pr63281.c | 11 +++++++++++
>  2 files changed, 25 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr63281.c
>
> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
> index 4b727d2a500..3798e11bdbc 100644
> --- a/gcc/config/rs6000/rs6000.cc
> +++ b/gcc/config/rs6000/rs6000.cc
> @@ -10098,6 +10098,20 @@ rs6000_emit_set_const (rtx dest, rtx source)
>           c = ((c & 0xffffffff) ^ 0x80000000) - 0x80000000;
>           emit_move_insn (lo, GEN_INT (c));
>         }
> +      else if (base_reg_operand (dest, mode)
> +              && num_insns_constant (source, mode) > 2)
> +       {
> +         rtx sym = force_const_mem (mode, source);
> +         if (TARGET_TOC && SYMBOL_REF_P (XEXP (sym, 0))
> +             && use_toc_relative_ref (XEXP (sym, 0), mode))
> +           {
> +             rtx toc = create_TOC_reference (XEXP (sym, 0), copy_rtx (dest));
> +             sym = gen_const_mem (mode, toc);
> +             set_mem_alias_set (sym, get_TOC_alias_set ());
> +           }
> +
> +         emit_insn (gen_rtx_SET (dest, sym));
> +       }
>        else
>         rs6000_emit_set_long_const (dest, c);
>        break;
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr63281.c b/gcc/testsuite/gcc.target/powerpc/pr63281.c
> new file mode 100644
> index 00000000000..469a8f64400
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr63281.c
> @@ -0,0 +1,11 @@
> +/* PR target/63281 */
> +/* { dg-do compile { target lp64 } } */
> +/* { dg-options "-O2 -std=c99" } */
> +
> +void
> +foo (unsigned long long *a)
> +{
> +  *a = 0x020805006106003;
> +}
> +
> +/* { dg-final { scan-assembler-times {\mp?ld\M} 1 } } */
> --
> 2.17.1
>

Reply via email to