Re: [PATCH, rs6000] Fix PR81833 (incorrect code gen for vec_msum)

Bill Schmidt Mon, 28 Aug 2017 17:39:22 -0700

On Aug 28, 2017, at 3:56 PM, Bill Schmidt <wschm...@linux.vnet.ibm.com> wrote:
> 
> Hi, 
> 
> PR81833 identifies a problem with the little-endian vector multiply-sum
> instructions.  The original implementation is quite poor (and I am allowed
> to say that, since it was mine).  This patch fixes the code properly.
> 
> The revised code still uses UNSPECs for these ops, which is not strictly
> necessary, although descriptive rtl for them would be pretty complex.  I've
> put in a FIXME to make note of that for a future cleanup.
> 
> Bootstrapped and tested on powerpc64le-linux-gnu with no regressions.  I am
> currently testing on powerpc64-linux-gnu for 32- and 64-bit.  Provided that
> testing succeeds, is this ok for trunk, and for eventual backport to all
> supported releases?


FYI, big-endian tests have completed successfully.

Bill
> 
> Thanks,
> Bill
> 
> 
> [gcc]
> 
> 2017-08-28  Bill Schmidt  <wschm...@linux.vnet.ibm.com>
> 
>       PR target/81833
>       * config/rs6000/altivec.md (altivec_vsum2sws): Convert from a
>       define_insn to a define_expand.
>       (altivec_vsum2sws_direct): New define_insn.
>       (altivec_vsumsws): Convert from a define_insn to a define_expand.
> 
> [gcc/testsuite]
> 
> 2017-08-28  Bill Schmidt  <wschm...@linux.vnet.ibm.com>
> 
>       PR target/81833
>       * gcc.target/powerpc/pr81833.c: New file.
> 
> 
> Index: gcc/config/rs6000/altivec.md
> ===================================================================
> --- gcc/config/rs6000/altivec.md      (revision 251369)
> +++ gcc/config/rs6000/altivec.md      (working copy)
> @@ -1804,51 +1804,61 @@
>   "vsum4s<VI_char>s %0,%1,%2"
>   [(set_attr "type" "veccomplex")])
> 
> -;; FIXME: For the following two patterns, the scratch should only be
> -;; allocated for !VECTOR_ELT_ORDER_BIG, and the instructions should
> -;; be emitted separately.
> -(define_insn "altivec_vsum2sws"
> -  [(set (match_operand:V4SI 0 "register_operand" "=v")
> -        (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
> -                      (match_operand:V4SI 2 "register_operand" "v")]
> -                  UNSPEC_VSUM2SWS))
> -   (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))
> -   (clobber (match_scratch:V4SI 3 "=v"))]
> +(define_expand "altivec_vsum2sws"
> +  [(use (match_operand:V4SI 0 "register_operand"))
> +   (use (match_operand:V4SI 1 "register_operand"))
> +   (use (match_operand:V4SI 2 "register_operand"))]
>   "TARGET_ALTIVEC"
> {
>   if (VECTOR_ELT_ORDER_BIG)
> -    return "vsum2sws %0,%1,%2";
> +    emit_insn (gen_altivec_vsum2sws_direct (operands[0], operands[1],
> +                                            operands[2]));
>   else
> -    return "vsldoi %3,%2,%2,12\n\tvsum2sws %3,%1,%3\n\tvsldoi %0,%3,%3,4";
> -}
> -  [(set_attr "type" "veccomplex")
> -   (set (attr "length")
> -     (if_then_else
> -       (match_test "VECTOR_ELT_ORDER_BIG")
> -       (const_string "4")
> -       (const_string "12")))])
> +    {
> +      rtx tmp1 = gen_reg_rtx (V4SImode);
> +      rtx tmp2 = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_altivec_vsldoi_v4si (tmp1, operands[2],
> +                                          operands[2], GEN_INT (12)));
> +      emit_insn (gen_altivec_vsum2sws_direct (tmp2, operands[1], tmp1));
> +      emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2,
> +                                          GEN_INT (4)));
> +    }
> +  DONE;
> +})
> 
> -(define_insn "altivec_vsumsws"
> +; FIXME: This can probably be expressed without an UNSPEC.
> +(define_insn "altivec_vsum2sws_direct"
>   [(set (match_operand:V4SI 0 "register_operand" "=v")
>         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
> -                      (match_operand:V4SI 2 "register_operand" "v")]
> -                  UNSPEC_VSUMSWS))
> -   (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))
> -   (clobber (match_scratch:V4SI 3 "=v"))]
> +                   (match_operand:V4SI 2 "register_operand" "v")]
> +                  UNSPEC_VSUM2SWS))]
>   "TARGET_ALTIVEC"
> +  "vsum2sws %0,%1,%2"
> +  [(set_attr "type" "veccomplex")
> +   (set_attr "length" "4")])
> +
> +(define_expand "altivec_vsumsws"
> +  [(use (match_operand:V4SI 0 "register_operand"))
> +   (use (match_operand:V4SI 1 "register_operand"))
> +   (use (match_operand:V4SI 2 "register_operand"))]
> +  "TARGET_ALTIVEC"
> {
>   if (VECTOR_ELT_ORDER_BIG)
> -    return "vsumsws %0,%1,%2";
> +    emit_insn (gen_altivec_vsumsws_direct (operands[0], operands[1],
> +                                           operands[2]));
>   else
> -    return "vspltw %3,%2,0\n\tvsumsws %3,%1,%3\n\tvsldoi %0,%3,%3,12";
> -}
> -  [(set_attr "type" "veccomplex")
> -   (set (attr "length")
> -     (if_then_else
> -       (match_test "(VECTOR_ELT_ORDER_BIG)")
> -       (const_string "4")
> -       (const_string "12")))])
> +    {
> +      rtx tmp1 = gen_reg_rtx (V4SImode);
> +      rtx tmp2 = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_altivec_vspltw_direct (tmp1, operands[2], const0_rtx));
> +      emit_insn (gen_altivec_vsumsws_direct (tmp2, operands[1], tmp1));
> +      emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2,
> +                                          GEN_INT (12)));
> +    }
> +  DONE;
> +})
> 
> +; FIXME: This can probably be expressed without an UNSPEC.
> (define_insn "altivec_vsumsws_direct"
>   [(set (match_operand:V4SI 0 "register_operand" "=v")
>         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
> Index: gcc/testsuite/gcc.target/powerpc/pr81833.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/pr81833.c        (nonexistent)
> +++ gcc/testsuite/gcc.target/powerpc/pr81833.c        (working copy)
> @@ -0,0 +1,54 @@
> +/* PR81833: This used to fail due to improper implementation of vec_msum.  */
> +
> +/* { dg-do run {target { lp64 } } } */
> +/* { dg-require-effective-target powerpc_altivec_ok } */
> +
> +#include <altivec.h>
> +
> +#define vec_u8  vector unsigned char
> +#define vec_s8  vector signed char
> +#define vec_u16 vector unsigned short
> +#define vec_s16 vector signed short
> +#define vec_u32 vector unsigned int
> +#define vec_s32 vector signed int
> +#define vec_f   vector float
> +
> +#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0)
> +
> +#define zero_u8v  (vec_u8)  zerov
> +#define zero_s8v  (vec_s8)  zerov
> +#define zero_u16v (vec_u16) zerov
> +#define zero_s16v (vec_s16) zerov
> +#define zero_u32v (vec_u32) zerov
> +#define zero_s32v (vec_s32) zerov
> +
> +signed int __attribute__((noinline))
> +scalarproduct_int16_vsx (const signed short *v1, const signed short *v2,
> +                      int order)
> +{
> +  int i;
> +  LOAD_ZERO;
> +  register vec_s16 vec1;
> +  register vec_s32 res = vec_splat_s32 (0), t;
> +  signed int ires;
> +
> +  for (i = 0; i < order; i += 8) {
> +    vec1 = vec_vsx_ld (0, v1);
> +    t    = vec_msum (vec1, vec_ld (0, v2), zero_s32v);
> +    res  = vec_sums (t, res);
> +    v1  += 8;
> +    v2  += 8;
> +  }
> +  res = vec_splat (res, 3);
> +  vec_ste (res, 0, &ires);
> +
> +  return ires;
> +}
> +
> +int main(void)
> +{
> +  const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 };
> +  if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8)
> +    __builtin_abort ();
> +  return 0;
> +}
>

Re: [PATCH, rs6000] Fix PR81833 (incorrect code gen for vec_msum)

Reply via email to