On Aug 28, 2017, at 3:56 PM, Bill Schmidt <wschm...@linux.vnet.ibm.com> wrote: > > Hi, > > PR81833 identifies a problem with the little-endian vector multiply-sum > instructions. The original implementation is quite poor (and I am allowed > to say that, since it was mine). This patch fixes the code properly. > > The revised code still uses UNSPECs for these ops, which is not strictly > necessary, although descriptive rtl for them would be pretty complex. I've > put in a FIXME to make note of that for a future cleanup. > > Bootstrapped and tested on powerpc64le-linux-gnu with no regressions. I am > currently testing on powerpc64-linux-gnu for 32- and 64-bit. Provided that > testing succeeds, is this ok for trunk, and for eventual backport to all > supported releases?
FYI, big-endian tests have completed successfully. Bill > > Thanks, > Bill > > > [gcc] > > 2017-08-28 Bill Schmidt <wschm...@linux.vnet.ibm.com> > > PR target/81833 > * config/rs6000/altivec.md (altivec_vsum2sws): Convert from a > define_insn to a define_expand. > (altivec_vsum2sws_direct): New define_insn. > (altivec_vsumsws): Convert from a define_insn to a define_expand. > > [gcc/testsuite] > > 2017-08-28 Bill Schmidt <wschm...@linux.vnet.ibm.com> > > PR target/81833 > * gcc.target/powerpc/pr81833.c: New file. > > > Index: gcc/config/rs6000/altivec.md > =================================================================== > --- gcc/config/rs6000/altivec.md (revision 251369) > +++ gcc/config/rs6000/altivec.md (working copy) > @@ -1804,51 +1804,61 @@ > "vsum4s<VI_char>s %0,%1,%2" > [(set_attr "type" "veccomplex")]) > > -;; FIXME: For the following two patterns, the scratch should only be > -;; allocated for !VECTOR_ELT_ORDER_BIG, and the instructions should > -;; be emitted separately. > -(define_insn "altivec_vsum2sws" > - [(set (match_operand:V4SI 0 "register_operand" "=v") > - (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") > - (match_operand:V4SI 2 "register_operand" "v")] > - UNSPEC_VSUM2SWS)) > - (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR)) > - (clobber (match_scratch:V4SI 3 "=v"))] > +(define_expand "altivec_vsum2sws" > + [(use (match_operand:V4SI 0 "register_operand")) > + (use (match_operand:V4SI 1 "register_operand")) > + (use (match_operand:V4SI 2 "register_operand"))] > "TARGET_ALTIVEC" > { > if (VECTOR_ELT_ORDER_BIG) > - return "vsum2sws %0,%1,%2"; > + emit_insn (gen_altivec_vsum2sws_direct (operands[0], operands[1], > + operands[2])); > else > - return "vsldoi %3,%2,%2,12\n\tvsum2sws %3,%1,%3\n\tvsldoi %0,%3,%3,4"; > -} > - [(set_attr "type" "veccomplex") > - (set (attr "length") > - (if_then_else > - (match_test "VECTOR_ELT_ORDER_BIG") > - (const_string "4") > - (const_string "12")))]) > + { > + rtx tmp1 = gen_reg_rtx (V4SImode); > + rtx tmp2 = gen_reg_rtx (V4SImode); > + emit_insn (gen_altivec_vsldoi_v4si (tmp1, operands[2], > + operands[2], GEN_INT (12))); > + emit_insn (gen_altivec_vsum2sws_direct (tmp2, operands[1], tmp1)); > + emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2, > + GEN_INT (4))); > + } > + DONE; > +}) > > -(define_insn "altivec_vsumsws" > +; FIXME: This can probably be expressed without an UNSPEC. > +(define_insn "altivec_vsum2sws_direct" > [(set (match_operand:V4SI 0 "register_operand" "=v") > (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") > - (match_operand:V4SI 2 "register_operand" "v")] > - UNSPEC_VSUMSWS)) > - (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR)) > - (clobber (match_scratch:V4SI 3 "=v"))] > + (match_operand:V4SI 2 "register_operand" "v")] > + UNSPEC_VSUM2SWS))] > "TARGET_ALTIVEC" > + "vsum2sws %0,%1,%2" > + [(set_attr "type" "veccomplex") > + (set_attr "length" "4")]) > + > +(define_expand "altivec_vsumsws" > + [(use (match_operand:V4SI 0 "register_operand")) > + (use (match_operand:V4SI 1 "register_operand")) > + (use (match_operand:V4SI 2 "register_operand"))] > + "TARGET_ALTIVEC" > { > if (VECTOR_ELT_ORDER_BIG) > - return "vsumsws %0,%1,%2"; > + emit_insn (gen_altivec_vsumsws_direct (operands[0], operands[1], > + operands[2])); > else > - return "vspltw %3,%2,0\n\tvsumsws %3,%1,%3\n\tvsldoi %0,%3,%3,12"; > -} > - [(set_attr "type" "veccomplex") > - (set (attr "length") > - (if_then_else > - (match_test "(VECTOR_ELT_ORDER_BIG)") > - (const_string "4") > - (const_string "12")))]) > + { > + rtx tmp1 = gen_reg_rtx (V4SImode); > + rtx tmp2 = gen_reg_rtx (V4SImode); > + emit_insn (gen_altivec_vspltw_direct (tmp1, operands[2], const0_rtx)); > + emit_insn (gen_altivec_vsumsws_direct (tmp2, operands[1], tmp1)); > + emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2, > + GEN_INT (12))); > + } > + DONE; > +}) > > +; FIXME: This can probably be expressed without an UNSPEC. > (define_insn "altivec_vsumsws_direct" > [(set (match_operand:V4SI 0 "register_operand" "=v") > (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") > Index: gcc/testsuite/gcc.target/powerpc/pr81833.c > =================================================================== > --- gcc/testsuite/gcc.target/powerpc/pr81833.c (nonexistent) > +++ gcc/testsuite/gcc.target/powerpc/pr81833.c (working copy) > @@ -0,0 +1,54 @@ > +/* PR81833: This used to fail due to improper implementation of vec_msum. */ > + > +/* { dg-do run {target { lp64 } } } */ > +/* { dg-require-effective-target powerpc_altivec_ok } */ > + > +#include <altivec.h> > + > +#define vec_u8 vector unsigned char > +#define vec_s8 vector signed char > +#define vec_u16 vector unsigned short > +#define vec_s16 vector signed short > +#define vec_u32 vector unsigned int > +#define vec_s32 vector signed int > +#define vec_f vector float > + > +#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0) > + > +#define zero_u8v (vec_u8) zerov > +#define zero_s8v (vec_s8) zerov > +#define zero_u16v (vec_u16) zerov > +#define zero_s16v (vec_s16) zerov > +#define zero_u32v (vec_u32) zerov > +#define zero_s32v (vec_s32) zerov > + > +signed int __attribute__((noinline)) > +scalarproduct_int16_vsx (const signed short *v1, const signed short *v2, > + int order) > +{ > + int i; > + LOAD_ZERO; > + register vec_s16 vec1; > + register vec_s32 res = vec_splat_s32 (0), t; > + signed int ires; > + > + for (i = 0; i < order; i += 8) { > + vec1 = vec_vsx_ld (0, v1); > + t = vec_msum (vec1, vec_ld (0, v2), zero_s32v); > + res = vec_sums (t, res); > + v1 += 8; > + v2 += 8; > + } > + res = vec_splat (res, 3); > + vec_ste (res, 0, &ires); > + > + return ires; > +} > + > +int main(void) > +{ > + const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 }; > + if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8) > + __builtin_abort (); > + return 0; > +} >