Re: [PATCH 7/9]AArch64: Implement widen_[us]sum using dotproduct for SVE [PR122069]

Kyrylo Tkachov Sat, 18 Oct 2025 15:29:09 -0700


> On 3 Oct 2025, at 11:46, Tamar Christina <[email protected]> wrote:
> 
> This patch implements support for using dotproduct to do sum reductions by
> changing += a into += (a * 1), i.e. we seed the multiplication with 1.
> 
> Given the example
> 
> int foo_int(unsigned char *x, unsigned char * restrict y) {
> int sum = 0;
> for (int i = 0; i < 8000; i++)
>    sum += char_abs(x[i] - y[i]);
> return sum;
> }
> 
> we used to generate
> 
> .L2:
>       ld1b    z1.b, p7/z, [x0, x2]
>       ld1b    z29.b, p7/z, [x1, x2]
>       sub     z29.b, z1.b, z29.b
>       uunpklo z0.h, z29.b
>       uunpkhi z29.h, z29.b
>       uunpklo z30.s, z0.h
>       add     z31.s, p6/m, z31.s, z30.s
>       uunpkhi z0.s, z0.h
>       add     z31.s, p5/m, z31.s, z0.s
>       uunpklo z28.s, z29.h
>       add     z31.s, p4/m, z31.s, z28.s
>       uunpkhi z29.s, z29.h
>       add     z31.s, p3/m, z31.s, z29.s
>       add     x2, x2, x7
>       whilelo p7.b, w2, w3
>       whilelo p3.s, w2, w6
>       whilelo p4.s, w2, w5
>       whilelo p5.s, w2, w4
>       whilelo p6.s, w2, w3
>       b.any   .L2
>       ptrue   p7.b, all
>       uaddv   d31, p7, z31.s
> 
> but now generates with +dotprod
> 
> .L3:
>       ld1b    z30.b, p7/z, [x5, x2]
>       ld1b    z29.b, p7/z, [x1, x2]
>       sub     z30.b, z30.b, z29.b
>       udot    z31.s, z30.b, z28.b
>       mov     x3, x2
>       add     x2, x2, x6
>       cmp     w2, w0
>       bls     .L3
>       incb    x3
>       uaddv   d31, p7, z31.s
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> PR middle-end/122069
> * config/aarch64/aarch64-sve.md (widen_<sur>sum<mode><vsi2qi>3): New.
> 
> gcc/testsuite/ChangeLog:
> 
> PR middle-end/122069
> * gcc.target/aarch64/sve/pr122069_1.c: New test.
> * gcc.target/aarch64/sve/pr122069_2.c: New test.
> 
> ---
> diff --git a/gcc/config/aarch64/aarch64-sve.md 
> b/gcc/config/aarch64/aarch64-sve.md
> index 
> 8c47d441c3fd6a70f0d2ef5a26883733a9fd36c1..29ef5cf990573fc9ff4bad8901f8f5004f985f36
>  100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -7722,6 +7722,20 @@ (define_insn 
> "@aarch64_<sur>dot_prod_lane<VNx4SI_ONLY:mode><VNx16QI_ONLY:mode>"
>  [(set_attr "sve_type" "sve_int_dot")]
> )
> 
> +;; Define double widen_[su]sum as dotproduct
> +(define_expand "widen_<sur>sum<mode><vsi2qi>3"
> +  [(set (match_operand:SVE_FULL_SDI 0 "register_operand")
> + (plus:SVE_FULL_SDI
> +   (unspec:SVE_FULL_SDI
> +     [(match_operand:<VSI2QI> 1 "register_operand")
> +      (match_dup 3)]
> +     DOTPROD)
> +   (match_operand:SVE_FULL_SDI 2 "register_operand")))]
> +  "TARGET_SVE"
> +{
> +  operands[3] = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
> +})


A small comment explaining what CONST1_RTX is doing here would be good for 
posterity.

> +
> ;; -------------------------------------------------------------------------
> ;; ---- [INT] Sum of absolute differences
> ;; -------------------------------------------------------------------------
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..5d1f61f4a6a8d02c190aeb96cb145a3a1ca1cd20
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c
> @@ -0,0 +1,45 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only 
> --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks 
> -fno-schedule-insns2 -fdump-tree-vect-details" }*/
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +inline char char_abs(char i) {
> +  return (i < 0 ? -i : i);
> +}
> +
> +/*
> +** foo_int:
> +**  ...
> +**  sub z[0-9]+.b, z[0-9]+.b, z[0-9]+.b
> +**  udot z[0-9]+.s, z[0-9]+.b, z[0-9]+.b
> +**  ...
> +*/
> +int foo_int(unsigned char *x, unsigned char * restrict y) {
> +  int sum = 0;
> +  for (int i = 0; i < 8000; i++)
> +     sum += char_abs(x[i] - y[i]);
> +  return sum;
> +}
> +
> +/* 
> +** foo2_int:
> +**  ...
> +**  add z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
> +**  punpklo p[0-9]+.h, p[0-9]+.b
> +**  uunpklo z[0-9]+.s, z[0-9]+.h
> +**  add z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> +**  punpkhi p[0-9]+.h, p[0-9]+.b
> +**  uunpkhi z[0-9]+.s, z[0-9]+.h
> +**  add z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> +**  ...
> +*/
> +int foo2_int(unsigned short *x, unsigned short * restrict y) {
> +  int sum = 0;
> +  for (int i = 0; i < 8000; i++)
> +    {
> +      x[i] = x[i] + y[i];
> +      sum += x[i];
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..62f7efde16811a282c1feffb97f0a229cd40482c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c
> @@ -0,0 +1,81 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target aarch64_sve_hw }  */
> +/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only 
> --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks 
> -fno-schedule-insns2 -fdump-tree-vect-details" }*/
> +

Is there much value in the scheduling and block-reordering flags for a runtime test here? If 
you’re not scanning the assembly, they seem superfluous.
Ok with that changed.
Thanks,
Kyrill

> +inline char char_abs(char i) {
> +  return (i < 0 ? -i : i);
> +}
> +
> +__attribute__((noipa))
> +int foo_int(unsigned char *x, unsigned char * restrict y) {
> +  int sum = 0;
> +  for (int i = 0; i < 100; i++)
> +     sum += char_abs(x[i] - y[i]);
> +  return sum;
> +}
> +
> +__attribute__((noipa))
> +int foo2_int(unsigned short *x, unsigned short * restrict y,
> +      unsigned short * restrict z) {
> +  int sum = 0;
> +  for (int i = 0; i < 100; i++)
> +    {
> +      z[i] = x[i] + y[i];
> +      sum += z[i];
> +    }
> +  return sum;
> +}
> +
> +__attribute__((noipa))
> +int foo_int2(unsigned char *x, unsigned char * restrict y) {
> +  int sum = 0;
> +#pragma GCC novector
> +  for (int i = 0; i < 100; i++)
> +     sum += char_abs(x[i] - y[i]);
> +  return sum;
> +}
> +
> +__attribute__((noipa))
> +int foo2_int2(unsigned short *x, unsigned short * restrict y,
> +       unsigned short * restrict z) {
> +  int sum = 0;
> +#pragma GCC novector
> +  for (int i = 0; i < 100; i++)
> +    {
> +      z[i] = x[i] + y[i];
> +      sum += z[i];
> +    }
> +  return sum;
> +}
> +
> +int main ()
> +{
> +  unsigned short a[100];
> +  unsigned short b[100];
> +  unsigned short r1[100];
> +  unsigned short r2[100];
> +  unsigned char c[100];
> +  unsigned char d[100];
> +#pragma GCC novector
> +  for (int i = 0; i < 100; i++)
> +    {
> +      a[i] = c[i] = i;
> +      b[i] = d[i] = 100 - i;
> +    }
> +
> +  if (foo_int (c, d) != foo_int2 (c, d))
> +    __builtin_abort();
> +
> +
> +  if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
> +    __builtin_abort();
> +
> +#pragma GCC novector
> +  for (int i = 0; i < 100; i++)
> +    if (r1[i] != r2[i])
> +      __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
> \ No newline at end of file
> 
> 
> -- 
> <rb19875.patch>

Re: [PATCH 7/9]AArch64: Implement widen_[us]sum using dotproduct for SVE [PR122069]

Reply via email to