Hi Tamar,

Sorry for the delay.

> -----Original Message-----
> From: Tamar Christina <tamar.christ...@arm.com>
> Sent: Tuesday, January 11, 2022 7:10 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <n...@arm.com>; Ramana Radhakrishnan
> <ramana.radhakrish...@arm.com>; Richard Earnshaw
> <richard.earns...@arm.com>; ni...@redhat.com; Kyrylo Tkachov
> <kyrylo.tkac...@arm.com>
> Subject: RE: [AArch32]: correct dot-product RTL patterns.
> 
> ping
> 
> > -----Original Message-----
> > From: Tamar Christina
> > Sent: Tuesday, December 21, 2021 12:31 PM
> > To: gcc-patches@gcc.gnu.org
> > Cc: nd <n...@arm.com>; Ramana Radhakrishnan
> > <ramana.radhakrish...@arm.com>; Richard Earnshaw
> > <richard.earns...@arm.com>; ni...@redhat.com; Kyrylo Tkachov
> > <kyrylo.tkac...@arm.com>
> > Subject: [AArch32]: correct dot-product RTL patterns.
> >
> > Hi All,
> >
> > The previous fix for this problem was wrong due to a subtle difference
> > between where NEON expects the RMW values and where intrinsics
> expects
> > them.
> >
> > The insn pattern is modeled after the intrinsics and so needs an expand for
> > the vectorizer optab to switch the RTL.
> >
> > However operand[3] is not expected to be written to so the current pattern
> > is bogus.
> >
> > Instead we use the expand to shuffle around the RTL.
> >
> > The vectorizer expects operands[3] and operands[0] to be the same but the
> > aarch64 intrinsics expanders expect operands[0] and operands[1] to be the
> > same.
> >
> > This also fixes some issues with big-endian, each dot product performs 4 8-
> > byte multiplications.  However compared to AArch64 we don't enter lanes
> in
> > GCC lane indexed in AArch32 aside from loads/stores.  This means no lane
> > remappings are done in arm-builtins.c and so none should be done at the
> > instruction side.
> >
> > There are some other instructions that need inspections as I think there are
> > more incorrect ones.
> >
> > Third there was a bug in the ACLE specication for dot product which has
> now
> > been fixed[1].  This means some intrinsics were missing and are added by
> > this patch.
> >
> > Bootstrapped and regtested on arm-none-linux-gnueabihf and no issues.
> >
> > Ok for master? and active branches after some stew?
> >
> > [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >     * config/arm/arm_neon.h (vdot_laneq_u32, vdotq_laneq_u32,
> >     vdot_laneq_s32, vdotq_laneq_s32): New.
> >     * config/arm/arm_neon_builtins.def (sdot_laneq, udot_laneq: New.
> >     * config/arm/neon.md (neon_<sup>dot<vsi2qi>): New.
> >     (<sup>dot_prod<vsi2qi>): Re-order rtl.
> >     (neon_<sup>dot_lane<vsi2qi>): Fix rtl order and endiannes.
> >     (neon_<sup>dot_laneq<vsi2qi>): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> >     * gcc.target/arm/simd/vdot-compile.c: Add new cases.
> >     * gcc.target/arm/simd/vdot-exec.c: Likewise.
> >

Ok but...

> > --- inline copy of patch --
> > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> index
> >
> 3364b37f69dfc33082388246c03149d9ad66a634..af6ac63dc3b47830d92f199d
> 93
> > 153ff510f658e9 100644
> > --- a/gcc/config/arm/arm_neon.h
> > +++ b/gcc/config/arm/arm_neon.h
> > @@ -18243,6 +18243,35 @@ vdotq_lane_s32 (int32x4_t __r, int8x16_t
> __a,
> > int8x8_t __b, const int __index)
> >    return __builtin_neon_sdot_lanev16qi (__r, __a, __b, __index);  }
> >
> > +__extension__ extern __inline uint32x2_t __attribute__
> > +((__always_inline__, __gnu_inline__, __artificial__))
> > +vdot_laneq_u32 (uint32x2_t __r, uint8x8_t __a, uint8x16_t __b, const
> > +int __index) {
> > +  return __builtin_neon_udot_laneqv8qi_uuuus (__r, __a, __b, __index);
> > +}
> > +
> > +__extension__ extern __inline uint32x4_t __attribute__
> > +((__always_inline__, __gnu_inline__, __artificial__))
> > +vdotq_laneq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b,
> > +           const int __index)
> > +{
> > +  return __builtin_neon_udot_laneqv16qi_uuuus (__r, __a, __b, __index);
> > +}
> > +
> > +__extension__ extern __inline int32x2_t __attribute__
> > +((__always_inline__, __gnu_inline__, __artificial__))
> > +vdot_laneq_s32 (int32x2_t __r, int8x8_t __a, int8x16_t __b, const int
> > +__index) {
> > +  return __builtin_neon_sdot_laneqv8qi (__r, __a, __b, __index); }
> > +
> > +__extension__ extern __inline int32x4_t __attribute__
> > +((__always_inline__, __gnu_inline__, __artificial__))
> > +vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int
> > +__index) {
> > +  return __builtin_neon_sdot_laneqv16qi (__r, __a, __b, __index); }
> > +
> >  #pragma GCC pop_options
> >  #endif
> >
> > diff --git a/gcc/config/arm/arm_neon_builtins.def
> > b/gcc/config/arm/arm_neon_builtins.def
> > index
> >
> fafb5c6fc51c16679ead1afda7cccfea8264fd15..f83dd4327c16c0af68f72eb6d9c
> a
> > 8cf21e2e56b5 100644
> > --- a/gcc/config/arm/arm_neon_builtins.def
> > +++ b/gcc/config/arm/arm_neon_builtins.def
> > @@ -342,6 +342,8 @@ VAR2 (TERNOP, sdot, v8qi, v16qi)
> >  VAR2 (UTERNOP, udot, v8qi, v16qi)
> >  VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi)
> >  VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
> > +VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi)
> > +VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi)
> >
> >  VAR1 (USTERNOP, usdot, v8qi)
> >  VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi) diff --git
> > a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index
> >
> 8b0a396947cc8e7345f178b926128d7224fb218a..848166311b5f82c5facb66e9
> 7c
> > 2260a5aba5d302 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -2866,20 +2866,49 @@ (define_expand "cmul<conj_op><mode>3"
> >  })
> >
> >
> > -;; These instructions map to the __builtins for the Dot Product operations.
> > -(define_insn "neon_<sup>dot<vsi2qi>"
> > +;; These map to the auto-vectorizer Dot Product optab.
> > +;; The auto-vectorizer expects a dot product builtin that also does an
> > +;; accumulation into the provided register.
> > +;; Given the following pattern
> > +;;
> > +;; for (i=0; i<len; i++) {
> > +;;     c = a[i] * b[i];
> > +;;     r += c;
> > +;; }
> > +;; return result;
> > +;;
> > +;; This can be auto-vectorized to
> > +;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; ;; ;; given
> > +enough iterations.  However the vectorizer can keep unrolling the loop
> > +;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; ;; r +=
> > +a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11]; ;; ...
> > +;;
> > +;; and so the vectorizer provides r, in which the result has to be
> accumulated.
> > +(define_insn "<sup>dot_prod<vsi2qi>"
> >    [(set (match_operand:VCVTI 0 "register_operand" "=w")
> > -   (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
> > -               (unspec:VCVTI [(match_operand:<VSI2QI> 2
> > -                                                   "register_operand"
> > "w")
> > -                              (match_operand:<VSI2QI> 3
> > -                                                   "register_operand"
> > "w")]
> > -           DOTPROD)))]
> > +   (plus:VCVTI
> > +     (unspec:VCVTI [(match_operand:<VSI2QI> 1 "register_operand"
> > "w")
> > +                    (match_operand:<VSI2QI> 2 "register_operand"
> > "w")]
> > +                    DOTPROD)
> > +     (match_operand:VCVTI 3 "register_operand" "0")))]
> >    "TARGET_DOTPROD"
> > -  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
> > +  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
> >    [(set_attr "type" "neon_dot<q>")]
> >  )
> >
> > +;; These instructions map to the __builtins for the Dot Product
> > +operations (define_expand "neon_<sup>dot<vsi2qi>"
> > +  [(set (match_operand:VCVTI 0 "register_operand" "=w")
> > +   (plus:VCVTI
> > +     (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand")
> > +                    (match_operand:<VSI2QI> 3 "register_operand")]
> > +                    DOTPROD)
> > +     (match_operand:VCVTI 1 "register_operand")))]
> > +  "TARGET_DOTPROD"
> > +)
> > +
> >  ;; These instructions map to the __builtins for the Dot Product operations.
> >  (define_insn "neon_usdot<vsi2qi>"
> >    [(set (match_operand:VCVTI 0 "register_operand" "=w") @@ -2898,17
> > +2927,40 @@ (define_insn "neon_usdot<vsi2qi>"
> >  ;; indexed operations.
> >  (define_insn "neon_<sup>dot_lane<vsi2qi>"
> >    [(set (match_operand:VCVTI 0 "register_operand" "=w")
> > -   (plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
> > -               (unspec:VCVTI [(match_operand:<VSI2QI> 2
> > -                                                   "register_operand"
> > "w")
> > -                              (match_operand:V8QI 3 "register_operand"
> > "t")
> > -                              (match_operand:SI 4 "immediate_operand"
> > "i")]
> > -           DOTPROD)))]
> > +   (plus:VCVTI
> > +     (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand"
> > "w")
> > +                    (match_operand:V8QI 3 "register_operand" "t")
> > +                    (match_operand:SI 4 "immediate_operand" "i")]
> > +                    DOTPROD)
> > +     (match_operand:VCVTI 1 "register_operand" "0")))]
> > +  "TARGET_DOTPROD"
> > +  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
> > +  [(set_attr "type" "neon_dot<q>")]
> > +)
> > +
> > +;; These instructions map to the __builtins for the Dot Product ;;
> > +indexed operations.
> > +(define_insn "neon_<sup>dot_laneq<vsi2qi>"
> > +  [(set (match_operand:VCVTI 0 "register_operand" "=w")
> > +   (plus:VCVTI
> > +     (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand"
> > "w")
> > +                    (match_operand:V16QI 3 "register_operand" "t")
> > +                    (match_operand:SI 4 "immediate_operand" "i")]
> > +                    DOTPROD)
> > +     (match_operand:VCVTI 1 "register_operand" "0")))]
> >    "TARGET_DOTPROD"
> >    {
> > -    operands[4]
> > -      = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL
> > (operands[4])));
> > -    return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
> > +    int lane = INTVAL (operands[4]);
> > +    if (lane > GET_MODE_NUNITS (V2SImode) - 1)
> > +      {
> > +   operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode));
> > +   return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
> > +      }
> > +    else
> > +      {
> > +   operands[4] = GEN_INT (lane);
> > +   return
> > "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
> > +      }
> >    }
> >    [(set_attr "type" "neon_dot<q>")]
> >  )
> > @@ -2932,43 +2984,6 @@ (define_insn "neon_<sup>dot_lane<vsi2qi>"
> >    [(set_attr "type" "neon_dot<q>")]
> >  )
> >
> > -;; These expands map to the Dot Product optab the vectorizer checks for.
> > -;; The auto-vectorizer expects a dot product builtin that also does an -;;
> > accumulation into the provided register.
> > -;; Given the following pattern
> > -;;
> > -;; for (i=0; i<len; i++) {
> > -;;     c = a[i] * b[i];
> > -;;     r += c;
> > -;; }
> > -;; return result;
> > -;;
> > -;; This can be auto-vectorized to
> > -;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]; -;; -;; given enough
> > iterations.  However the vectorizer can keep unrolling the loop -;; r +=
> > a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7]; -;; r += a[8]*b[8] + 
> > a[9]*b[9] +
> > a[10]*b[10] + a[11]*b[11]; -;; ...
> > -;;
> > -;; and so the vectorizer provides r, in which the result has to be
> accumulated.
> > -(define_expand "<sup>dot_prod<vsi2qi>"
> > -  [(set (match_operand:VCVTI 0 "register_operand")
> > -   (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1
> > -                                                   "register_operand")
> > -                              (match_operand:<VSI2QI> 2
> > -                                                   "register_operand")]
> > -                DOTPROD)
> > -               (match_operand:VCVTI 3 "register_operand")))]
> > -  "TARGET_DOTPROD"
> > -{
> > -  emit_insn (
> > -    gen_neon_<sup>dot<vsi2qi> (operands[3], operands[3], operands[1],
> > -                            operands[2]));
> > -  emit_insn (gen_rtx_SET (operands[0], operands[3]));
> > -  DONE;
> > -})
> > -
> >  ;; Auto-vectorizer pattern for usdot
> >  (define_expand "usdot_prod<vsi2qi>"
> >    [(set (match_operand:VCVTI 0 "register_operand") diff --git
> > a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
> > b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
> > index
> >
> b3bd3bf00e3822fdd60b5955165583d5a5cdc1d0..d3541e829a44fa07972096a
> 02
> > 226adea1d26f09d 100644
> > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
> > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
> > @@ -49,8 +49,28 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x,
> int8x8_t
> > y)
> >    return vdotq_lane_s32 (r, x, y, 0);
> >  }
> >
> > -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, 
> > d[0-
> > 9]+} 4 } } */
> > +int32x2_t sfoo_laneq1 (int32x2_t r, int8x8_t x, int8x16_t y) {
> > +  return vdot_laneq_s32 (r, x, y, 0);
> > +}
> > +
> > +int32x4_t sfooq_lane1 (int32x4_t r, int8x16_t x, int8x16_t y) {
> > +  return vdotq_laneq_s32 (r, x, y, 0);
> > +}
> > +
> > +int32x2_t sfoo_laneq2 (int32x2_t r, int8x8_t x, int8x16_t y) {
> > +  return vdot_laneq_s32 (r, x, y, 2);
> > +}
> > +
> > +int32x4_t sfooq_lane2 (int32x4_t r, int8x16_t x, int8x16_t y) {
> > +  return vdotq_laneq_s32 (r, x, y, 2);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+,
> > +d[0-9]+, d[0-9]+} 6 } } */
> >  /* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, 
> > q[0-
> > 9]+} 2 } } */
> > -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, 
> > d[0-
> > 9]+\[#?[0-9]\]} 2 } } */
> > -/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, 
> > d[0-
> > 9]+\[#?[0-9]\]} 2 } } */
> > +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+,
> > +d[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */
> > +/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+,
> > +q[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */
> >
> > diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
> > b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
> > index
> >
> 054f4703394b4184284dac371415bef8e9bac45d..97b7898bd6a0fc9a898eba0
> ea
> > 15fbf38eb1405a3 100644
> > --- a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
> > +++ b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
> > @@ -2,6 +2,7 @@
> >  /* { dg-additional-options "-O3" } */
> >  /* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw } */
> >  /* { dg-add-options arm_v8_2a_dotprod_neon }  */
> > +/* { dg-additional-options "-w" } */

... Why is "-w" needed here? Can the test be adjusted to not generate 
unnecessary warnings instead?

Thanks,
Kyrill

> >
> >  #include <arm_neon.h>
> >
> > @@ -33,7 +34,20 @@ extern void abort();
> >     t3 f##_##rx1 = {0};                         \
> >     f##_##rx1 =  f (f##_##rx1, f##_##x, f##_##y, ORDER (1, 1));  \
> >     if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4)   \
> > -     abort (); \
> > +     abort ();
> > +
> > +#define P2(n1,n2) n1,n1,n1,n1,n2,n2,n2,n2,n1,n1,n1,n1,n2,n2,n2,n2
> > +#define TEST_LANEQ(t1, t2, t3, f, r1, r2, n1, n2, n3, n4) \
> > +   ARR(f, x, t1, r1);                  \
> > +   ARR(f, y, t2, r2);                  \
> > +   t3 f##_##rx = {0};                  \
> > +   f##_##rx = f (f##_##rx, f##_##x, f##_##y, ORDER (3, 2));  \
> > +   if (f##_##rx[0] != n1 || f##_##rx[1] != n2)   \
> > +     abort ();                                 \
> > +   t3 f##_##rx1 = {0};                         \
> > +   f##_##rx1 =  f (f##_##rx1, f##_##x, f##_##y, ORDER (3, 3));  \
> > +   if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4)   \
> > +     abort ();
> >
> >  int
> >  main()
> > @@ -45,11 +59,16 @@ main()
> >    TEST (int8x16_t, int8x16_t, int32x4_t, vdotq_s32, P(1,2), P(-2,-3), -8, 
> > -24);
> >
> >    TEST_LANE (uint8x8_t, uint8x8_t, uint32x2_t, vdot_lane_u32, P(1,2),
> P(2,3),
> > 8, 16, 12, 24);
> > -
> >    TEST_LANE (int8x8_t, int8x8_t, int32x2_t, vdot_lane_s32, P(1,2), 
> > P(-2,-3), -
> 8,
> > -16, -12, -24);
> >
> >    TEST_LANE (uint8x16_t, uint8x8_t, uint32x4_t, vdotq_lane_u32, P(1,2),
> > P(2,3), 8, 16, 12, 24);
> >    TEST_LANE (int8x16_t, int8x8_t, int32x4_t, vdotq_lane_s32, P(1,2), P(-2,-
> 3),
> > -8, -16, -12, -24);
> >
> > +  TEST_LANEQ (uint8x8_t, uint8x16_t, uint32x2_t, vdot_laneq_u32,
> > + P2(1,2), P2(2,3), 8, 16, 12, 24);  TEST_LANEQ (int8x8_t, int8x16_t,
> > + int32x2_t, vdot_laneq_s32, P2(1,2), P2(-2,-3), -8, -16, -12, -24);
> > +
> > +  TEST_LANEQ (uint8x16_t, uint8x16_t, uint32x4_t, vdotq_laneq_u32,
> > + P2(1,2), P2(2,3), 8, 16, 12, 24);  TEST_LANEQ (int8x16_t, int8x16_t,
> > + int32x4_t, vdotq_laneq_s32, P2(1,2), P2(-2,-3), -8, -16, -12, -24);
> > +
> >    return 0;
> >  }
> >
> >
> > --

Reply via email to