Re: [PATCH v4] LoongArch: Optimize immediate load.

Xi Ruoyao via Gcc-patches Tue, 22 Nov 2022 06:03:42 -0800

While I still can't fully understand the immediate load issue and how
this patch fixes it, I've tested this patch (alongside the prefetch
instruction patch) with bootstrap-ubsan, and the compiled result of
imm-load1.c seems OK.


On Thu, 2022-11-17 at 17:59 +0800, Lulu Cheng wrote:
> v1 -> v2:
> 1. Change the code format.
> 2. Fix bugs in the code.
> 
> v2 -> v3:
> Modify a piece of code whose implementation had undefined behavior.
> 
> v3 -> v4:
> Move the immediate number decomposition from the expand pass to the
> split pass.
> 
> Both regression tests and spec2006 passed.
> 
> The problem mentioned in the link below was that the four immediate
> load instructions were not moved out of the loop.  This has now been
> optimized: as in the test case, the four immediate load instructions
> are generated outside the loop.
> (https://sourceware.org/pipermail/libc-alpha/2022-September/142202.html)
> 
> --------------------------------------------------------------------
> The loop2_invariant pass moves instructions that do not change inside
> a loop out of the loop.  If the immediate decomposition is performed
> during the expand pass, some of the resulting instructions do not meet
> the conditions for being moved out, so the immediate decomposition is
> now deferred to the split pass.
> 
> gcc/ChangeLog:
> 
>         * config/loongarch/loongarch.cc (enum
> loongarch_load_imm_method):
>         Remove the member METHOD_INSV that is not currently used.
>         (struct loongarch_integer_op): Define a new member curr_value,
>         that records the value of the number stored in the destination
>         register immediately after the current instruction has run.
>         (loongarch_build_integer): Assign a value to the curr_value
> member variable.
>         (loongarch_move_integer): Adds information for the immediate
> load instruction.
>         * config/loongarch/loongarch.md (*movdi_32bit): Redefine as
> define_insn_and_split.
>         (*movdi_64bit): Likewise.
>         (*movsi_internal): Likewise.
>         (*movhi_internal): Likewise.
>         * config/loongarch/predicates.md (move_operand): Return true
>         for any CONST_INT, to ensure that the immediate number is not
>         decomposed during expand, before the loop optimizations run.
> 
> gcc/testsuite/ChangeLog:
> 
>         * gcc.target/loongarch/imm-load.c: New test.
>         * gcc.target/loongarch/imm-load1.c: New test.
> ---
>  gcc/config/loongarch/loongarch.cc             | 62 ++++++++++--------
> -
>  gcc/config/loongarch/loongarch.md             | 44 +++++++++++--
>  gcc/config/loongarch/predicates.md            |  2 +-
>  gcc/testsuite/gcc.target/loongarch/imm-load.c | 10 +++
>  .../gcc.target/loongarch/imm-load1.c          | 26 ++++++++
>  5 files changed, 110 insertions(+), 34 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load.c
>  create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load1.c
> 
> diff --git a/gcc/config/loongarch/loongarch.cc
> b/gcc/config/loongarch/loongarch.cc
> index 8ee32c90573..9e0d6c7c3ea 100644
> --- a/gcc/config/loongarch/loongarch.cc
> +++ b/gcc/config/loongarch/loongarch.cc
> @@ -139,22 +139,21 @@ struct loongarch_address_info
>  
>     METHOD_LU52I:
>       Load 52-63 bit of the immediate number.
> -
> -   METHOD_INSV:
> -     immediate like 0xfff00000fffffxxx
> -   */
> +*/
>  enum loongarch_load_imm_method
>  {
>    METHOD_NORMAL,
>    METHOD_LU32I,
> -  METHOD_LU52I,
> -  METHOD_INSV
> +  METHOD_LU52I
>  };
>  
>  struct loongarch_integer_op
>  {
>    enum rtx_code code;
>    HOST_WIDE_INT value;
> +  /* Represent the result of the immediate count of the load
> instruction at
> +     each step.  */
> +  HOST_WIDE_INT curr_value;
>    enum loongarch_load_imm_method method;
>  };
>  
> @@ -1475,24 +1474,27 @@ loongarch_build_integer (struct
> loongarch_integer_op *codes,
>      {
>        /* The value of the lower 32 bit be loaded with one
> instruction.
>          lu12i.w.  */
> -      codes[0].code = UNKNOWN;
> -      codes[0].method = METHOD_NORMAL;
> -      codes[0].value = low_part;
> +      codes[cost].code = UNKNOWN;
> +      codes[cost].method = METHOD_NORMAL;
> +      codes[cost].value = low_part;
> +      codes[cost].curr_value = low_part;
>        cost++;
>      }
>    else
>      {
>        /* lu12i.w + ior.  */
> -      codes[0].code = UNKNOWN;
> -      codes[0].method = METHOD_NORMAL;
> -      codes[0].value = low_part & ~(IMM_REACH - 1);
> +      codes[cost].code = UNKNOWN;
> +      codes[cost].method = METHOD_NORMAL;
> +      codes[cost].value = low_part & ~(IMM_REACH - 1);
> +      codes[cost].curr_value = codes[cost].value;
>        cost++;
>        HOST_WIDE_INT iorv = low_part & (IMM_REACH - 1);
>        if (iorv != 0)
>         {
> -         codes[1].code = IOR;
> -         codes[1].method = METHOD_NORMAL;
> -         codes[1].value = iorv;
> +         codes[cost].code = IOR;
> +         codes[cost].method = METHOD_NORMAL;
> +         codes[cost].value = iorv;
> +         codes[cost].curr_value = low_part;
>           cost++;
>         }
>      }
> @@ -1515,11 +1517,14 @@ loongarch_build_integer (struct
> loongarch_integer_op *codes,
>         {
>           codes[cost].method = METHOD_LU52I;
>           codes[cost].value = value & LU52I_B;
> +         codes[cost].curr_value = value;
>           return cost + 1;
>         }
>  
>        codes[cost].method = METHOD_LU32I;
>        codes[cost].value = (value & LU32I_B) | (sign51 ? LU52I_B : 0);
> +      codes[cost].curr_value = (value & 0xfffffffffffff)
> +       | (sign51 ? LU52I_B : 0);
>        cost++;
>  
>        /* Determine whether the 52-61 bits are sign-extended from the
> low order,
> @@ -1528,6 +1533,7 @@ loongarch_build_integer (struct
> loongarch_integer_op *codes,
>         {
>           codes[cost].method = METHOD_LU52I;
>           codes[cost].value = value & LU52I_B;
> +         codes[cost].curr_value = value;
>           cost++;
>         }
>      }
> @@ -2911,6 +2917,9 @@ loongarch_move_integer (rtx temp, rtx dest,
> unsigned HOST_WIDE_INT value)
>        else
>         x = force_reg (mode, x);
>  
> +      set_unique_reg_note (get_last_insn (), REG_EQUAL,
> +                          GEN_INT (codes[i-1].curr_value));
> +
>        switch (codes[i].method)
>         {
>         case METHOD_NORMAL:
> @@ -2918,22 +2927,17 @@ loongarch_move_integer (rtx temp, rtx dest,
> unsigned HOST_WIDE_INT value)
>                               GEN_INT (codes[i].value));
>           break;
>         case METHOD_LU32I:
> -         emit_insn (
> -           gen_rtx_SET (x,
> -                        gen_rtx_IOR (DImode,
> -                                     gen_rtx_ZERO_EXTEND (
> -                                       DImode, gen_rtx_SUBREG
> (SImode, x, 0)),
> -                                     GEN_INT (codes[i].value))));
> +         gcc_assert (mode == DImode);
> +         x = gen_rtx_IOR (DImode,
> +                          gen_rtx_ZERO_EXTEND (DImode,
> +                                               gen_rtx_SUBREG
> (SImode, x, 0)),
> +                          GEN_INT (codes[i].value));
>           break;
>         case METHOD_LU52I:
> -         emit_insn (gen_lu52i_d (x, x, GEN_INT (0xfffffffffffff),
> -                                 GEN_INT (codes[i].value)));
> -         break;
> -       case METHOD_INSV:
> -         emit_insn (
> -           gen_rtx_SET (gen_rtx_ZERO_EXTRACT (DImode, x, GEN_INT
> (20),
> -                                              GEN_INT (32)),
> -                        gen_rtx_REG (DImode, 0)));
> +         gcc_assert (mode == DImode);
> +         x = gen_rtx_IOR (DImode,
> +                          gen_rtx_AND (DImode, x, GEN_INT
> (0xfffffffffffff)),
> +                          GEN_INT (codes[i].value));
>           break;
>         default:
>           gcc_unreachable ();
> diff --git a/gcc/config/loongarch/loongarch.md
> b/gcc/config/loongarch/loongarch.md
> index 2fda5381904..f61db66d535 100644
> --- a/gcc/config/loongarch/loongarch.md
> +++ b/gcc/config/loongarch/loongarch.md
> @@ -1718,23 +1718,41 @@ (define_expand "movdi"
>      DONE;
>  })
>  
> -(define_insn "*movdi_32bit"
> +(define_insn_and_split "*movdi_32bit"
>    [(set (match_operand:DI 0 "nonimmediate_operand"
> "=r,r,r,w,*f,*f,*r,*m")
>         (match_operand:DI 1 "move_operand" "r,i,w,r,*J*r,*m,*f,*f"))]
>    "!TARGET_64BIT
>     && (register_operand (operands[0], DImode)
>         || reg_or_0_operand (operands[1], DImode))"
>    { return loongarch_output_move (operands[0], operands[1]); }
> +  "CONST_INT_P (operands[1]) && REG_P (operands[0]) && GP_REG_P
> (REGNO
> +  (operands[0]))"
> +  [(const_int 0)]
> +  "
> +{
> +  loongarch_move_integer (operands[0], operands[0], INTVAL
> (operands[1]));
> +  DONE;
> +}
> +  "
>    [(set_attr "move_type"
> "move,const,load,store,mgtf,fpload,mftg,fpstore")
>     (set_attr "mode" "DI")])
>  
> -(define_insn "*movdi_64bit"
> +(define_insn_and_split "*movdi_64bit"
>    [(set (match_operand:DI 0 "nonimmediate_operand"
> "=r,r,r,w,*f,*f,*r,*m")
>         (match_operand:DI 1 "move_operand"
> "r,Yd,w,rJ,*r*J,*m,*f,*f"))]
>    "TARGET_64BIT
>     && (register_operand (operands[0], DImode)
>         || reg_or_0_operand (operands[1], DImode))"
>    { return loongarch_output_move (operands[0], operands[1]); }
> +  "CONST_INT_P (operands[1]) && REG_P (operands[0]) && GP_REG_P
> (REGNO
> +  (operands[0]))"
> +  [(const_int 0)]
> +  "
> +{
> +  loongarch_move_integer (operands[0], operands[0], INTVAL
> (operands[1]));
> +  DONE;
> +}
> +  "
>    [(set_attr "move_type"
> "move,const,load,store,mgtf,fpload,mftg,fpstore")
>     (set_attr "mode" "DI")])
>  
> @@ -1749,12 +1767,21 @@ (define_expand "movsi"
>      DONE;
>  })
>  
> -(define_insn "*movsi_internal"
> +(define_insn_and_split "*movsi_internal"
>    [(set (match_operand:SI 0 "nonimmediate_operand"
> "=r,r,r,w,*f,*f,*r,*m,*r,*z")
>         (match_operand:SI 1 "move_operand"
> "r,Yd,w,rJ,*r*J,*m,*f,*f,*z,*r"))]
>    "(register_operand (operands[0], SImode)
>      || reg_or_0_operand (operands[1], SImode))"
>    { return loongarch_output_move (operands[0], operands[1]); }
> +  "CONST_INT_P (operands[1]) && REG_P (operands[0]) && GP_REG_P
> (REGNO
> +  (operands[0]))"
> +  [(const_int 0)]
> +  "
> +{
> +  loongarch_move_integer (operands[0], operands[0], INTVAL
> (operands[1]));
> +  DONE;
> +}
> +  "
>    [(set_attr "move_type"
> "move,const,load,store,mgtf,fpload,mftg,fpstore,mftg,mgtf")
>     (set_attr "mode" "SI")])
>  
> @@ -1774,12 +1801,21 @@ (define_expand "movhi"
>      DONE;
>  })
>  
> -(define_insn "*movhi_internal"
> +(define_insn_and_split "*movhi_internal"
>    [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,r,m,r,k")
>         (match_operand:HI 1 "move_operand" "r,Yd,I,m,rJ,k,rJ"))]
>    "(register_operand (operands[0], HImode)
>         || reg_or_0_operand (operands[1], HImode))"
>    { return loongarch_output_move (operands[0], operands[1]); }
> +  "CONST_INT_P (operands[1]) && REG_P (operands[0]) && GP_REG_P
> (REGNO
> +  (operands[0]))"
> +  [(const_int 0)]
> +  "
> +{
> +  loongarch_move_integer (operands[0], operands[0], INTVAL
> (operands[1]));
> +  DONE;
> +}
> +  "
>    [(set_attr "move_type" "move,const,const,load,store,load,store")
>     (set_attr "mode" "HI")])
>  
> diff --git a/gcc/config/loongarch/predicates.md
> b/gcc/config/loongarch/predicates.md
> index 8bd0c1376c9..58c3dc2261c 100644
> --- a/gcc/config/loongarch/predicates.md
> +++ b/gcc/config/loongarch/predicates.md
> @@ -226,7 +226,7 @@ (define_predicate "move_operand"
>    switch (GET_CODE (op))
>      {
>      case CONST_INT:
> -      return !splittable_const_int_operand (op, mode);
> +      return true;
>  
>      case CONST:
>      case SYMBOL_REF:
> diff --git a/gcc/testsuite/gcc.target/loongarch/imm-load.c
> b/gcc/testsuite/gcc.target/loongarch/imm-load.c
> new file mode 100644
> index 00000000000..c04ca33996f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/imm-load.c
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mabi=lp64d -O2 -fdump-rtl-split1" } */
> +
> +long int
> +test (void)
> +{
> +  return 0x1234567890abcdef;
> +}
> +/* { dg-final { scan-rtl-dump-times "scanning new insn with uid" 6
> "split1" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/loongarch/imm-load1.c
> b/gcc/testsuite/gcc.target/loongarch/imm-load1.c
> new file mode 100644
> index 00000000000..2ff02971239
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/imm-load1.c
> @@ -0,0 +1,26 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mabi=lp64d -O2" } */
> +/* { dg-final { scan-assembler
> "test:.*lu52i\.d.*\n\taddi\.w.*\n\.L2:" } } */
> +
> +
> +extern long long b[10];
> +static inline long long
> +repeat_bytes (void)
> +{
> +  long long r = 0x0101010101010101;
> +
> +  return r;
> +}
> +
> +static inline long long
> +highbit_mask (long long m)
> +{
> +  return m & repeat_bytes ();
> +}
> +
> +void test(long long *a)
> +{
> +  for (int i = 0; i < 10; i++)
> +    b[i] = highbit_mask (a[i]);
> +
> +}

-- 
Xi Ruoyao <xry...@xry111.site>
School of Aerospace Science and Technology, Xidian University

Re: [PATCH v4] LoongArch: Optimize immediate load.

Reply via email to