Quoting from the final patch in the series: ------------------------------------------------------------------------ This patch adds support for reusing a main loop's reduction accumulator in an epilogue loop. This in turn lets the loops share a single piece of vector->scalar reduction code.
The patch has the following restrictions: (1) The epilogue reduction can only operate on a single vector (e.g. ncopies must be 1 for non-SLP reductions, and the group size must be <= the element count for SLP reductions). (2) Both loops must use the same vector mode for their accumulators. This means that the patch is restricted to targets that support --param vect-partial-vector-usage=1. (3) The reduction must be a standard “tree code” reduction. However, these restrictions could be lifted in future. For example, if the main loop operates on 128-bit vectors and the epilogue loop operates on 64-bit vectors, we could in future reduce the 128-bit vector by one stage and use the 64-bit result as the starting point for the epilogue result. The patch tries to handle chained SLP reductions, unchained SLP reductions and non-SLP reductions. It also handles cases in which the epilogue loop is entered directly (rather than via the main loop) and cases in which the epilogue loop can be skipped. ------------------------------------------------------------------------ However, it ended up being difficult to do that without some preparatory clean-ups. Some of them could probably stand on their own, but others are a bit “meh” without the final patch to justify them. The diff below shows the effect of the patch when compiling: unsigned short __attribute__((noipa)) add_loop (unsigned short *x, int n) { unsigned short res = 0; for (int i = 0; i < n; ++i) res += x[i]; return res; } with -O3 --param vect-partial-vector-usage=1 on an SVE target: add_loop: add_loop: .LFB0: .LFB0: .cfi_startproc .cfi_startproc mov x4, x0 < cmp w1, 0 cmp w1, 0 ble .L7 ble .L7 cnth x0 | cnth x4 sub w2, w1, #1 sub w2, w1, #1 sub w3, w0, #1 | sub w3, w4, #1 cmp w2, w3 cmp w2, w3 bcc .L8 bcc .L8 sub w0, w1, w0 | sub w4, w1, w4 mov x3, 0 mov x3, 0 cnth x5 cnth x5 mov z0.b, #0 mov z0.b, #0 ptrue p0.b, all ptrue p0.b, all .p2align 3,,7 .p2align 3,,7 .L4: .L4: ld1h z1.h, p0/z, [x4, x3, | ld1h z1.h, p0/z, [x0, x3, mov x2, x3 mov x2, x3 add x3, x3, x5 add x3, x3, x5 add z0.h, z0.h, z1.h add z0.h, z0.h, z1.h cmp w0, w3 | cmp w4, w3 bcs .L4 bcs .L4 uaddv d0, p0, z0.h < umov w0, v0.h[0] < inch x2 inch x2 and w0, w0, 65535 < cmp w1, w2 cmp w1, w2 beq .L2 | beq .L6 .L3: .L3: sub w1, w1, w2 sub w1, w1, w2 mov z1.b, #0 | add x2, x0, w2, uxtw 1 whilelo p0.h, wzr, w1 whilelo p0.h, wzr, w1 add x2, x4, w2, uxtw 1 | ld1h z1.h, p0/z, [x2] ptrue p1.b, all | add z0.h, p0/m, z0.h, z1. ld1h z0.h, p0/z, [x2] | .L6: sel z0.h, p0, z0.h, z1.h | ptrue p0.b, all uaddv d0, p1, z0.h | uaddv d0, p0, z0.h fmov x1, d0 | umov w0, v0.h[0] add w0, w0, w1, uxth < and w0, w0, 65535 and w0, w0, 65535 .L2: < ret ret .p2align 2,,3 .p2align 2,,3 .L7: .L7: mov w0, 0 mov w0, 0 ret ret .L8: .L8: mov w2, 0 mov w2, 0 mov w0, 0 | mov z0.b, #0 b .L3 b .L3 .cfi_endproc .cfi_endproc Kewen, could you give this a spin on Power 10 to see whether it works/helps there? I've attached a combined diff. Series tested on aarch64-linux-gnu and x86_64-linux-gnu. Richard
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c new file mode 100644 index 00000000000..fb817b73d77 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10.c @@ -0,0 +1,77 @@ +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */ + +unsigned short __attribute__((noipa)) +add_loop (unsigned short *x, int n) +{ + unsigned short res = 0; + for (int i = 0; i < n; ++i) + res += x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +min_loop (unsigned short *x, int n) +{ + unsigned short res = ~0; + for (int i = 0; i < n; ++i) + res = res < x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +max_loop (unsigned short *x, int n) +{ + unsigned short res = 0; + for (int i = 0; i < n; ++i) + res = res > x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +and_loop (unsigned short *x, int n) +{ + unsigned short res = ~0; + for (int i = 0; i < n; ++i) + res &= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +or_loop (unsigned short *x, int n) +{ + unsigned short res = 0; + for (int i = 0; i < n; ++i) + res |= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +eor_loop (unsigned short *x, int n) +{ + unsigned short res = 0; + for (int i = 0; i < n; ++i) + res ^= x[i]; + return res; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c new file mode 100644 index 00000000000..1dd579be701 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_10_run.c @@ -0,0 +1,49 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_10.c" + +int +main (void) +{ + unsigned short x[N]; + for (int i = 0; i < N; ++i) + x[i] = (i + 1) * (i + 2); + + if (add_loop (x, 0) != 0 + || add_loop (x, 11) != 572 + || add_loop (x, 0x100) != 22016 + || add_loop (x, 0xfff) != 20480 + || max_loop (x, 0) != 0 + || max_loop (x, 11) != 132 + || max_loop (x, 0x100) != 65280 + || max_loop (x, 0xfff) != 65504 + || or_loop (x, 0) != 0 + || or_loop (x, 11) != 0xfe + || or_loop (x, 0x80) != 0x7ffe + || or_loop (x, 0xb4) != 0x7ffe + || or_loop (x, 0xb5) != 0xfffe + || eor_loop (x, 0) != 0 + || eor_loop (x, 11) != 0xe8 + || eor_loop (x, 0x100) != 0xcf00 + || eor_loop (x, 0xfff) != 0xa000) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + x[i] = ~x[i]; + + if (min_loop (x, 0) != 65535 + || min_loop (x, 11) != 65403 + || min_loop (x, 0x100) != 255 + || min_loop (x, 0xfff) != 31 + || and_loop (x, 0) != 0xffff + || and_loop (x, 11) != 0xff01 + || and_loop (x, 0x80) != 0x8001 + || and_loop (x, 0xb4) != 0x8001 + || and_loop (x, 0xb5) != 1) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c new file mode 100644 index 00000000000..f99ef4aa865 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11.c @@ -0,0 +1,71 @@ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +unsigned short __attribute__((noipa)) +add_loop (unsigned short *x, unsigned short res) +{ + for (int i = 0; i < 0xfff; ++i) + res += x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +min_loop (unsigned short *x, unsigned short res) +{ + for (int i = 0; i < 0xfff; ++i) + res = res < x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +max_loop (unsigned short *x, unsigned short res) +{ + for (int i = 0; i < 0xfff; ++i) + res = res > x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +and_loop (unsigned short *x, unsigned short res) +{ + for (int i = 0; i < 0xfff; ++i) + res &= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +or_loop (unsigned short *x, unsigned short res) +{ + for (int i = 0; i < 0xfff; ++i) + res |= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +eor_loop (unsigned short *x, unsigned short res) +{ + for (int i = 0; i < 0xfff; ++i) + res ^= x[i]; + return res; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c new file mode 100644 index 00000000000..5b41560d2ef --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_11_run.c @@ -0,0 +1,34 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_11.c" + +int +main (void) +{ + unsigned short x[N]; + for (int i = 0; i < N; ++i) + x[i] = (i + 1) * (i + 2); + + if (add_loop (x, 42) != 20522 + || max_loop (x, 65503) != 65504 + || max_loop (x, 65505) != 65505 + || or_loop (x, 0) != 0xfffe + || or_loop (x, 1) != 0xffff + || eor_loop (x, 0) != 0xa000 + || eor_loop (x, 0xbfff) != 0x1fff) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + x[i] = ~x[i]; + + if (min_loop (x, 32) != 31 + || min_loop (x, 30) != 30 + || and_loop (x, 0xff) != 1 + || and_loop (x, 0) != 0) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c new file mode 100644 index 00000000000..d32b81a61bc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12.c @@ -0,0 +1,71 @@ +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */ + +unsigned short __attribute__((noipa)) +add_loop (unsigned short *x, int n, unsigned short res) +{ + for (int i = 0; i < n; ++i) + res += x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +min_loop (unsigned short *x, int n, unsigned short res) +{ + for (int i = 0; i < n; ++i) + res = res < x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +max_loop (unsigned short *x, int n, unsigned short res) +{ + for (int i = 0; i < n; ++i) + res = res > x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +and_loop (unsigned short *x, int n, unsigned short res) +{ + for (int i = 0; i < n; ++i) + res &= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +or_loop (unsigned short *x, int n, unsigned short res) +{ + for (int i = 0; i < n; ++i) + res |= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +eor_loop (unsigned short *x, int n, unsigned short res) +{ + for (int i = 0; i < n; ++i) + res ^= x[i]; + return res; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c new file mode 100644 index 00000000000..929b81a9705 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_12_run.c @@ -0,0 +1,66 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_12.c" + +int +main (void) +{ + unsigned short x[N]; + for (int i = 0; i < N; ++i) + x[i] = (i + 1) * (i + 2); + + if (add_loop (x, 0, 10) != 10 + || add_loop (x, 11, 42) != 614 + || add_loop (x, 0x100, 84) != 22100 + || add_loop (x, 0xfff, 20) != 20500 + || max_loop (x, 0, 10) != 10 + || max_loop (x, 11, 131) != 132 + || max_loop (x, 11, 133) != 133 + || max_loop (x, 0x100, 65279) != 65280 + || max_loop (x, 0x100, 65281) != 65281 + || max_loop (x, 0xfff, 65503) != 65504 + || max_loop (x, 0xfff, 65505) != 65505 + || or_loop (x, 0, 0x71) != 0x71 + || or_loop (x, 11, 0) != 0xfe + || or_loop (x, 11, 0xb3c) != 0xbfe + || or_loop (x, 0x80, 0) != 0x7ffe + || or_loop (x, 0x80, 1) != 0x7fff + || or_loop (x, 0xb4, 0) != 0x7ffe + || or_loop (x, 0xb4, 1) != 0x7fff + || or_loop (x, 0xb5, 0) != 0xfffe + || or_loop (x, 0xb5, 1) != 0xffff + || eor_loop (x, 0, 0x3e) != 0x3e + || eor_loop (x, 11, 0) != 0xe8 + || eor_loop (x, 11, 0x1ff) != 0x117 + || eor_loop (x, 0x100, 0) != 0xcf00 + || eor_loop (x, 0x100, 0xeee) != 0xc1ee + || eor_loop (x, 0xfff, 0) != 0xa000 + || eor_loop (x, 0xfff, 0x8888) != 0x2888) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + x[i] = ~x[i]; + + if (min_loop (x, 0, 10000) != 10000 + || min_loop (x, 11, 65404) != 65403 + || min_loop (x, 11, 65402) != 65402 + || min_loop (x, 0x100, 256) != 255 + || min_loop (x, 0x100, 254) != 254 + || min_loop (x, 0xfff, 32) != 31 + || min_loop (x, 0xfff, 30) != 30 + || and_loop (x, 0, 0x1234) != 0x1234 + || and_loop (x, 11, 0xffff) != 0xff01 + || and_loop (x, 11, 0xcdef) != 0xcd01 + || and_loop (x, 0x80, 0xffff) != 0x8001 + || and_loop (x, 0x80, 0xfffe) != 0x8000 + || and_loop (x, 0xb4, 0xffff) != 0x8001 + || and_loop (x, 0xb4, 0xfffe) != 0x8000 + || and_loop (x, 0xb5, 0xffff) != 1 + || and_loop (x, 0xb5, 0xfffe) != 0) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c new file mode 100644 index 00000000000..ce2b8f2fcdc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13.c @@ -0,0 +1,101 @@ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +void __attribute__((noipa)) +add_loop (unsigned int *x, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < 0x7ff; ++i) + { + res0 += x[i * 2]; + res1 += x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +min_loop (unsigned int *x, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < 0x7ff; ++i) + { + res0 = res0 < x[i * 2] ? res0 : x[i * 2]; + res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +max_loop (unsigned int *x, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < 0x7ff; ++i) + { + res0 = res0 > x[i * 2] ? res0 : x[i * 2]; + res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +and_loop (unsigned int *x, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < 0x7ff; ++i) + { + res0 &= x[i * 2]; + res1 &= x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +or_loop (unsigned int *x, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < 0x7ff; ++i) + { + res0 |= x[i * 2]; + res1 |= x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +eor_loop (unsigned int *x, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < 0x7ff; ++i) + { + res0 ^= x[i * 2]; + res1 ^= x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c new file mode 100644 index 00000000000..5514d8d6b3b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_13_run.c @@ -0,0 +1,61 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_13.c" + +int +main (void) +{ + unsigned int x[N]; + for (int i = 0; i < N; ++i) + x[i] = ((i + 1) * (i + 2)) & 0xfffff; + + unsigned int add_res[2] = { 42, 1111 }; + add_loop (x, add_res); + if (add_res[0] != 968538154 + || add_res[1] != 964340823) + __builtin_abort (); + + unsigned int max_res1[2] = { 0, 0 }; + max_loop (x, max_res1); + if (max_res1[0] != 1048150 + || max_res1[1] != 1045506) + __builtin_abort (); + + unsigned int max_res2[2] = { 1048151, 1045507 }; + max_loop (x, max_res2); + if (max_res2[0] != 1048151 + || max_res2[1] != 1045507) + __builtin_abort (); + + unsigned int or_res[2] = { 0x1000000, 0x2000000 }; + or_loop (x, or_res); + if (or_res[0] != 0x10ffffe + || or_res[1] != 0x20ffffe) + __builtin_abort (); + + unsigned int eor_res[2] = { 0x1000000, 0x2000000 }; + eor_loop (x, eor_res); + if (eor_res[0] != 0x1010000 + || eor_res[1] != 0x20b5000) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + x[i] = ~x[i] & 0xfffff; + + unsigned int min_res1[2] = { 500, 4000 }; + min_loop (x, min_res1); + if (min_res1[0] != 425 + || min_res1[1] != 3069) + __builtin_abort (); + + unsigned int min_res2[2] = { 424, 3068 }; + min_loop (x, min_res2); + if (min_res2[0] != 424 + || min_res2[1] != 3068) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c new file mode 100644 index 00000000000..3be611e4b37 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14.c @@ -0,0 +1,107 @@ +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */ + +void __attribute__((noipa)) +add_loop (unsigned int *x, int n, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < n; ++i) + { + res0 += x[i * 2]; + res1 += x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +min_loop (unsigned int *x, int n, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < n; ++i) + { + res0 = res0 < x[i * 2] ? res0 : x[i * 2]; + res1 = res1 < x[i * 2 + 1] ? res1 : x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +max_loop (unsigned int *x, int n, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < n; ++i) + { + res0 = res0 > x[i * 2] ? res0 : x[i * 2]; + res1 = res1 > x[i * 2 + 1] ? res1 : x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +and_loop (unsigned int *x, int n, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < n; ++i) + { + res0 &= x[i * 2]; + res1 &= x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +or_loop (unsigned int *x, int n, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < n; ++i) + { + res0 |= x[i * 2]; + res1 |= x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +void __attribute__((noipa)) +eor_loop (unsigned int *x, int n, unsigned int *res) +{ + unsigned int res0 = res[0]; + unsigned int res1 = res[1]; + for (int i = 0; i < n; ++i) + { + res0 ^= x[i * 2]; + res1 ^= x[i * 2 + 1]; + } + res[0] = res0; + res[1] = res1; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\t} 2 } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuminv\t} 2 } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumaxv\t} 2 } } */ + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tandv\t} 2 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torv\t} 2 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teorv\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c new file mode 100644 index 00000000000..ccaa770e9b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_14_run.c @@ -0,0 +1,187 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_14.c" + +int +main (void) +{ + unsigned int x[N]; + for (int i = 0; i < N; ++i) + x[i] = ((i + 1) * (i + 2)) & 0xfffff; + + unsigned int add_res1[2] = { 11, 22 }; + add_loop (x, 0, add_res1); + if (add_res1[0] != 11 + || add_res1[1] != 22) + __builtin_abort (); + + unsigned int add_res2[2] = { 10, 20 }; + add_loop (x, 11, add_res2); + if (add_res2[0] != 1902 + || add_res2[1] != 2176) + __builtin_abort (); + + unsigned int add_res3[2] = { 15, 30 }; + add_loop (x, 0x100, add_res3); + if (add_res3[0] != 22435087 + || add_res3[1] != 22566686) + __builtin_abort (); + + unsigned int add_res4[2] = { 100, 200 }; + add_loop (x, 0x11f, add_res4); + if (add_res4[0] != 31602244 + || add_res4[1] != 31767656) + __builtin_abort (); + + unsigned int max_res1[2] = { 461, 500 }; + max_loop (x, 11, max_res1); + if (max_res1[0] != 462 + || max_res1[1] != 506) + __builtin_abort (); + + unsigned int max_res2[2] = { 463, 507 }; + max_loop (x, 11, max_res2); + if (max_res2[0] != 463 + || max_res2[1] != 507) + __builtin_abort (); + + unsigned int max_res3[2] = { 1000000, 1000000 }; + max_loop (x, 0x200, max_res3); + if (max_res3[0] != 1047552 + || max_res3[1] != 1045506) + __builtin_abort (); + + unsigned int max_res4[2] = { 1047553, 1045507 }; + max_loop (x, 0x200, max_res4); + if (max_res4[0] != 1047553 + || max_res4[1] != 1045507) + __builtin_abort (); + + unsigned int max_res5[2] = { 300000, 30000 }; + max_loop (x, 0x11f, max_res5); + if (max_res5[0] != 328902 + || max_res5[1] != 330050) + __builtin_abort (); + + unsigned int max_res6[2] = { 328903, 330051 }; + max_loop (x, 0x11f, max_res6); + if (max_res6[0] != 328903 + || max_res6[1] != 330051) + __builtin_abort (); + + unsigned int or_res1[2] = { 11, 22 }; + or_loop (x, 0, or_res1); + if (or_res1[0] != 11 + || or_res1[1] != 22) + __builtin_abort (); + + unsigned int or_res2[2] = { 0x200000, 0xe00000 }; + or_loop (x, 11, or_res2); + if (or_res2[0] != 0x2001fe + || or_res2[1] != 0xe001fe) + __builtin_abort (); + + unsigned int or_res3[2] = { 0x800000, 0x700000 }; + or_loop (x, 0x40, or_res3); + if (or_res3[0] != 0x803ffe + || or_res3[1] != 0x707ffe) + __builtin_abort (); + + unsigned int or_res4[2] = { 0x100001, 0x300000 }; + or_loop (x, 0x4f, or_res4); + if (or_res4[0] != 0x107fff + || or_res4[1] != 0x307ffe) + __builtin_abort (); + + unsigned int eor_res1[2] = { 11, 22 }; + eor_loop (x, 0, eor_res1); + if (eor_res1[0] != 11 + || eor_res1[1] != 22) + __builtin_abort (); + + unsigned int eor_res2[2] = { 0x2000ff, 0xe000ff }; + eor_loop (x, 11, eor_res2); + if (eor_res2[0] != 0x2001cf + || eor_res2[1] != 0xe000b7) + __builtin_abort (); + + unsigned int eor_res3[2] = { 0x805000, 0x70f000 }; + eor_loop (x, 0x100, eor_res3); + if (eor_res3[0] != 0x824200 + || eor_res3[1] != 0x77dc00) + __builtin_abort (); + + unsigned int eor_res4[2] = { 0x101201, 0x300f00 }; + eor_loop (x, 0x11f, eor_res4); + if (eor_res4[0] != 0x178801 + || eor_res4[1] != 0x337240) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + x[i] = ~x[i] & 0xfffff; + + unsigned int min_res1[2] = { 1048200, 1048100 }; + min_loop (x, 11, min_res1); + if (min_res1[0] != 1048113 + || min_res1[1] != 1048069) + __builtin_abort (); + + unsigned int min_res2[2] = { 1048112, 1048068 }; + min_loop (x, 11, min_res2); + if (min_res2[0] != 1048112 + || min_res2[1] != 1048068) + __builtin_abort (); + + unsigned int min_res3[2] = { 10000, 10000 }; + min_loop (x, 0x200, min_res3); + if (min_res3[0] != 1023 + || min_res3[1] != 3069) + __builtin_abort (); + + unsigned int min_res4[2] = { 1022, 3068 }; + min_loop (x, 0x200, min_res4); + if (min_res4[0] != 1022 + || min_res4[1] != 3068) + __builtin_abort (); + + unsigned int min_res5[2] = { 719680, 718530 }; + min_loop (x, 0x11f, min_res5); + if (min_res5[0] != 719673 + || min_res5[1] != 718525) + __builtin_abort (); + + unsigned int min_res6[2] = { 719672, 718524 }; + min_loop (x, 0x11f, min_res6); + if (min_res6[0] != 719672 + || min_res6[1] != 718524) + __builtin_abort (); + + unsigned int and_res1[2] = { 11, 22 }; + and_loop (x, 0, and_res1); + if (and_res1[0] != 11 + || and_res1[1] != 22) + __builtin_abort (); + + unsigned int and_res2[2] = { 0xf5cff, 0xf78ff }; + and_loop (x, 11, and_res2); + if (and_res2[0] != 0xf5c01 + || and_res2[1] != 0xf7801) + __builtin_abort (); + + unsigned int and_res3[2] = { 0x7efff, 0xecfff }; + and_loop (x, 0x40, and_res3); + if (and_res3[0] != 0x7c001 + || and_res3[1] != 0xe8001) + __builtin_abort (); + + unsigned int and_res4[2] = { 0xffffff, 0xffffff }; + and_loop (x, 0x4f, and_res4); + if (and_res4[0] != 0xf8001 + || and_res4[1] != 0xf8001) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c new file mode 100644 index 00000000000..15b1ade30e2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15.c @@ -0,0 +1,16 @@ +/* { dg-options "-O3 --param vect-partial-vector-usage=1" } */ + +int __attribute__((noipa)) +add_loop (int *x, int n, int res) +{ + for (int i = 0; i < n; ++i) + { + res += x[i * 2]; + res += x[i * 2 + 1]; + } + return res; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c new file mode 100644 index 00000000000..3207fce5be3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_15_run.c @@ -0,0 +1,22 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_15.c" + +int +main (void) +{ + int x[N]; + for (int i = 0; i < N; ++i) + x[i] = ((i + 1) * (i + 2)) & 0xfffff; + + if (add_loop (x, 0, 33) != 33 + || add_loop (x, 11, 30) != 4078 + || add_loop (x, 0x100, 45) != 45001773 + || add_loop (x, 0x11f, 300) != 63369900) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c new file mode 100644 index 00000000000..b839821d6bb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9.c @@ -0,0 +1,77 @@ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +unsigned short __attribute__((noipa)) +add_loop (unsigned short *x) +{ + unsigned short res = 0; + for (int i = 0; i < 0xfff; ++i) + res += x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +min_loop (unsigned short *x) +{ + unsigned short res = ~0; + for (int i = 0; i < 0xfff; ++i) + res = res < x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +max_loop (unsigned short *x) +{ + unsigned short res = 0; + for (int i = 0; i < 0xfff; ++i) + res = res > x[i] ? res : x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +and_loop (unsigned short *x) +{ + unsigned short res = ~0; + for (int i = 0; i < 0xfff; ++i) + res &= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +or_loop (unsigned short *x) +{ + unsigned short res = 0; + for (int i = 0; i < 0xfff; ++i) + res |= x[i]; + return res; +} + +unsigned short __attribute__((noipa)) +eor_loop (unsigned short *x) +{ + unsigned short res = 0; + for (int i = 0; i < 0xfff; ++i) + res ^= x[i]; + return res; +} + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tuaddv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuminv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumaxv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tandv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\torv\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\teorv\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c new file mode 100644 index 00000000000..aa248f53eaa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_9_run.c @@ -0,0 +1,29 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O3 -msve-vector-bits=256 --param vect-partial-vector-usage=1" } */ + +#define N 0x1100 + +#include "reduc_9.c" + +int +main (void) +{ + unsigned short x[N]; + for (int i = 0; i < N; ++i) + x[i] = (i + 1) * (i + 2); + + if (add_loop (x) != 20480 + || max_loop (x) != 65504 + || or_loop (x) != 0xfffe + || eor_loop (x) != 0xa000) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + x[i] = ~x[i]; + + if (min_loop (x) != 31 + || and_loop (x) != 1) + __builtin_abort (); + + return 0; +} diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index 2909e8a0fc3..b7b0523e3c8 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -2457,6 +2457,31 @@ vect_update_epilogue_niters (loop_vec_info epilogue_vinfo, return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true); } +/* LOOP_VINFO is an epilogue loop and MAIN_LOOP_VALUE is available on exit + from the corresponding main loop. Return a value that is available in + LOOP_VINFO's preheader, using SKIP_VALUE if the main loop is skipped. + Passing a null SKIP_VALUE is equivalent to passing zero. */ + +tree +vect_get_main_loop_result (loop_vec_info loop_vinfo, tree main_loop_value, + tree skip_value) +{ + if (!loop_vinfo->main_loop_edge) + return main_loop_value; + + if (!skip_value) + skip_value = build_zero_cst (TREE_TYPE (main_loop_value)); + + tree phi_result = make_ssa_name (TREE_TYPE (main_loop_value)); + basic_block bb = loop_vinfo->main_loop_edge->dest; + gphi *new_phi = create_phi_node (phi_result, bb); + add_phi_arg (new_phi, main_loop_value, loop_vinfo->main_loop_edge, + UNKNOWN_LOCATION); + add_phi_arg (new_phi, skip_value, + loop_vinfo->skip_main_loop_edge, UNKNOWN_LOCATION); + return phi_result; +} + /* Function vect_do_peeling. Input: @@ -2986,6 +3011,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, skip_vector ? anchor : guard_bb, prob_epilog.invert (), irred_flag); + if (vect_epilogues) + epilogue_vinfo->skip_this_loop_edge = guard_e; slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e, single_exit (epilog)); /* Only need to handle basic block before epilog loop if it's not @@ -3057,6 +3084,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e, UNKNOWN_LOCATION); niters = PHI_RESULT (new_phi); + epilogue_vinfo->main_loop_edge = update_e; + epilogue_vinfo->skip_main_loop_edge = skip_e; } /* Set ADVANCE to the number of iterations performed by the previous diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index bc523d151c6..5e6c9b7c38a 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -19,6 +19,7 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ +#define INCLUDE_ALGORITHM #include "config.h" #include "system.h" #include "coretypes.h" @@ -823,6 +824,10 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) th (0), versioning_threshold (0), vectorization_factor (0), + main_loop_edge (nullptr), + skip_main_loop_edge (nullptr), + skip_this_loop_edge (nullptr), + reusable_accumulators (), max_vectorization_factor (0), mask_skip_niters (NULL_TREE), rgroup_compare_type (NULL_TREE), @@ -3248,23 +3253,15 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn) } } -/* If there is a neutral value X such that SLP reduction NODE would not - be affected by the introduction of additional X elements, return that X, - otherwise return null. CODE is the code of the reduction and VECTOR_TYPE - is the vector type that would hold element X. REDUC_CHAIN is true if - the SLP statements perform a single reduction, false if each statement - performs an independent reduction. */ +/* If there is a neutral value X such that a reduction would not be affected + by the introduction of additional X elements, return that X, otherwise + return null. CODE is the code of the reduction and SCALAR_TYPE is type + of the scalar elements. If the reduction has just a single initial value + then INITIAL_VALUE is that value, otherwise it is null. */ static tree -neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type, - tree_code code, bool reduc_chain) +neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value) { - vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); - stmt_vec_info stmt_vinfo = stmts[0]; - tree scalar_type = TREE_TYPE (vector_type); - class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father; - gcc_assert (loop); - switch (code) { case WIDEN_SUM_EXPR: @@ -3284,13 +3281,7 @@ neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type, case MAX_EXPR: case MIN_EXPR: - /* For MIN/MAX the initial values are neutral. A reduction chain - has only a single initial value, so that value is neutral for - all statements. */ - if (reduc_chain) - return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, - loop_preheader_edge (loop)); - return NULL_TREE; + return initial_value; default: return NULL_TREE; @@ -4621,64 +4612,58 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, prologue_cost, epilogue_cost); } +/* SEQ is a sequence of instructions that initialize the reduction + described by REDUC_INFO. Emit them in the appropriate place. */ +static void +vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo, + stmt_vec_info reduc_info, gimple *seq) +{ + if (reduc_info->reused_accumulator) + { + /* When reusing an accumulator from the main loop, we only need + initialization instructions if the main loop can be skipped. + In that case, emit the initialization instructions at the end + of the guard block that does the skip. */ + edge skip_edge = loop_vinfo->skip_main_loop_edge; + gcc_assert (skip_edge); + gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src); + gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); + } + else + { + /* The normal case: emit the initialization instructions on the + preheader edge. */ + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq); + } +} /* Function get_initial_def_for_reduction Input: - STMT_VINFO - a stmt that performs a reduction operation in the loop. + REDUC_INFO - the info_for_reduction INIT_VAL - the initial value of the reduction variable + NEUTRAL_OP - a value that has no effect on the reduction, as per + neutral_op_for_reduction Output: - ADJUSTMENT_DEF - a tree that holds a value to be added to the final result - of the reduction (used for adjusting the epilog - see below). Return a vector variable, initialized according to the operation that STMT_VINFO performs. This vector will be used as the initial value of the vector of partial results. - Option1 (adjust in epilog): Initialize the vector as follows: - add/bit or/xor: [0,0,...,0,0] - mult/bit and: [1,1,...,1,1] - min/max/cond_expr: [init_val,init_val,..,init_val,init_val] - and when necessary (e.g. add/mult case) let the caller know - that it needs to adjust the result by init_val. - - Option2: Initialize the vector as follows: - add/bit or/xor: [init_val,0,0,...,0] - mult/bit and: [init_val,1,1,...,1] - min/max/cond_expr: [init_val,init_val,...,init_val] - and no adjustments are needed. - - For example, for the following code: - - s = init_val; - for (i=0;i<n;i++) - s = s + a[i]; - - STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'. - For a vector of 4 units, we want to return either [0,0,0,init_val], - or [0,0,0,0] and let the caller know that it needs to adjust - the result at the end by 'init_val'. - - FORNOW, we are using the 'adjust in epilog' scheme, because this way the - initialization vector is simpler (same element in all entries), if - ADJUSTMENT_DEF is not NULL, and Option2 otherwise. - - A cost model should help decide between these two schemes. */ + The value we need is a vector in which element 0 has value INIT_VAL + and every other element has value NEUTRAL_OP. */ static tree get_initial_def_for_reduction (loop_vec_info loop_vinfo, - stmt_vec_info stmt_vinfo, - enum tree_code code, tree init_val, - tree *adjustment_def) + stmt_vec_info reduc_info, + tree init_val, tree neutral_op) { class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree scalar_type = TREE_TYPE (init_val); tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type); - tree def_for_init; tree init_def; - REAL_VALUE_TYPE real_init_val = dconst0; - int int_init_val = 0; gimple_seq stmts = NULL; gcc_assert (vectype); @@ -4686,115 +4671,64 @@ get_initial_def_for_reduction (loop_vec_info loop_vinfo, gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type) || SCALAR_FLOAT_TYPE_P (scalar_type)); - gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo) - || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father); + gcc_assert (nested_in_vect_loop_p (loop, reduc_info) + || loop == (gimple_bb (reduc_info->stmt))->loop_father); - /* ADJUSTMENT_DEF is NULL when called from - vect_create_epilog_for_reduction to vectorize double reduction. */ - if (adjustment_def) - *adjustment_def = NULL; - - switch (code) + if (operand_equal_p (init_val, neutral_op)) { - case WIDEN_SUM_EXPR: - case DOT_PROD_EXPR: - case SAD_EXPR: - case PLUS_EXPR: - case MINUS_EXPR: - case BIT_IOR_EXPR: - case BIT_XOR_EXPR: - case MULT_EXPR: - case BIT_AND_EXPR: - { - if (code == MULT_EXPR) - { - real_init_val = dconst1; - int_init_val = 1; - } - - if (code == BIT_AND_EXPR) - int_init_val = -1; - - if (SCALAR_FLOAT_TYPE_P (scalar_type)) - def_for_init = build_real (scalar_type, real_init_val); - else - def_for_init = build_int_cst (scalar_type, int_init_val); - - if (adjustment_def || operand_equal_p (def_for_init, init_val, 0)) - { - /* Option1: the first element is '0' or '1' as well. */ - if (!operand_equal_p (def_for_init, init_val, 0)) - *adjustment_def = init_val; - init_def = gimple_build_vector_from_val (&stmts, vectype, - def_for_init); - } - else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) - { - /* Option2 (variable length): the first element is INIT_VAL. */ - init_def = gimple_build_vector_from_val (&stmts, vectype, - def_for_init); - init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT, - vectype, init_def, init_val); - } - else - { - /* Option2: the first element is INIT_VAL. */ - tree_vector_builder elts (vectype, 1, 2); - elts.quick_push (init_val); - elts.quick_push (def_for_init); - init_def = gimple_build_vector (&stmts, &elts); - } - } - break; - - case MIN_EXPR: - case MAX_EXPR: - case COND_EXPR: - { - init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); - init_def = gimple_build_vector_from_val (&stmts, vectype, init_val); - } - break; - - default: - gcc_unreachable (); + /* If both elements are equal then the vector described above is + just a splat. */ + neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op); + init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op); + } + else + { + neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op); + init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val); + if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) + { + /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into + element 0. */ + init_def = gimple_build_vector_from_val (&stmts, vectype, + neutral_op); + init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT, + vectype, init_def, init_val); + } + else + { + /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */ + tree_vector_builder elts (vectype, 1, 2); + elts.quick_push (init_val); + elts.quick_push (neutral_op); + init_def = gimple_build_vector (&stmts, &elts); + } } if (stmts) - gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); + vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts); return init_def; } -/* Get at the initial defs for the reduction PHIs in SLP_NODE. - NUMBER_OF_VECTORS is the number of vector defs to create. - If NEUTRAL_OP is nonnull, introducing extra elements of that - value will not change the result. */ +/* Get at the initial defs for the reduction PHIs for REDUC_INFO, + which performs a reduction involving GROUP_SIZE scalar statements. + NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP + is nonnull, introducing extra elements of that value will not change the + result. */ static void -get_initial_defs_for_reduction (vec_info *vinfo, - slp_tree slp_node, +get_initial_defs_for_reduction (loop_vec_info loop_vinfo, + stmt_vec_info reduc_info, vec<tree> *vec_oprnds, unsigned int number_of_vectors, - bool reduc_chain, tree neutral_op) + unsigned int group_size, tree neutral_op) { - vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node); - stmt_vec_info stmt_vinfo = stmts[0]; + vec<tree> &initial_values = reduc_info->reduc_initial_values; unsigned HOST_WIDE_INT nunits; unsigned j, number_of_places_left_in_vector; - tree vector_type; - unsigned int group_size = stmts.length (); + tree vector_type = STMT_VINFO_VECTYPE (reduc_info); unsigned int i; - class loop *loop; - - vector_type = STMT_VINFO_VECTYPE (stmt_vinfo); - - gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def); - loop = (gimple_bb (stmt_vinfo->stmt))->loop_father; - gcc_assert (loop); - edge pe = loop_preheader_edge (loop); - - gcc_assert (!reduc_chain || neutral_op); + gcc_assert (group_size == initial_values.length () || neutral_op); /* NUMBER_OF_COPIES is the number of times we need to use the same values in created vectors. It is greater than 1 if unrolling is performed. @@ -4824,18 +4758,13 @@ get_initial_defs_for_reduction (vec_info *vinfo, { tree op; i = j % group_size; - stmt_vinfo = stmts[i]; /* Get the def before the loop. In reduction chain we have only one initial value. Else we have as many as PHIs in the group. */ - if (reduc_chain) - op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); - else if (((vec_oprnds->length () + 1) * nunits - - number_of_places_left_in_vector >= group_size) - && neutral_op) + if (i >= initial_values.length () || (j > i && neutral_op)) op = neutral_op; else - op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe); + op = initial_values[i]; /* Create 'vect_ = {op0,op1,...,opn}'. */ number_of_places_left_in_vector--; @@ -4871,8 +4800,8 @@ get_initial_defs_for_reduction (vec_info *vinfo, { /* First time round, duplicate ELTS to fill the required number of vectors. */ - duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts, - number_of_vectors, *vec_oprnds); + duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type, + elts, number_of_vectors, *vec_oprnds); break; } vec_oprnds->quick_push (init); @@ -4884,7 +4813,7 @@ get_initial_defs_for_reduction (vec_info *vinfo, } } if (ctor_seq != NULL) - gsi_insert_seq_on_edge_immediate (pe, ctor_seq); + vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq); } /* For a statement STMT_INFO taking part in a reduction operation return @@ -4906,15 +4835,107 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info) } else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) { - edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father); - stmt_vec_info info - = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe)); + stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi)); if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def) stmt_info = info; } return stmt_info; } +/* PHI is a reduction in LOOP_VINFO that we are going to vectorize using vector + type VECTYPE. See if LOOP_VINFO is an epilogue loop whose main loop had a + matching reduction that we can build on. Adjust REDUC_INFO and return true + if so, otherwise return false. */ + +static bool +vect_find_reusable_accumulator (loop_vec_info loop_vinfo, + stmt_vec_info reduc_info) +{ + loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); + if (!main_loop_vinfo) + return false; + + if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION) + return false; + + unsigned int num_phis = reduc_info->reduc_initial_values.length (); + auto_vec<tree, 16> main_loop_results (num_phis); + auto_vec<tree, 16> initial_values (num_phis); + if (edge main_loop_edge = loop_vinfo->main_loop_edge) + { + /* The epilogue loop can be entered either from the main loop or + from an earlier guard block. */ + edge skip_edge = loop_vinfo->skip_main_loop_edge; + for (tree incoming_value : reduc_info->reduc_initial_values) + { + /* Look for: + + INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop), + INITIAL_VALUE(guard block)>. */ + gcc_assert (TREE_CODE (incoming_value) == SSA_NAME); + + gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value)); + gcc_assert (gimple_bb (phi) == main_loop_edge->dest); + + tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge); + tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge); + + main_loop_results.quick_push (from_main_loop); + initial_values.quick_push (from_skip); + } + } + else + /* The main loop dominates the epilogue loop. */ + main_loop_results.splice (reduc_info->reduc_initial_values); + + /* See if the main loop has the kind of accumulator we need. */ + vect_reusable_accumulator *accumulator + = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]); + if (!accumulator + || num_phis != accumulator->reduc_info->reduc_scalar_results.length () + || !std::equal (main_loop_results.begin (), main_loop_results.end (), + accumulator->reduc_info->reduc_scalar_results.begin ())) + return false; + + /* For now, only handle the case in which both loops are operating on the + same vector types. In future we could reduce wider vectors to narrower + ones as well. */ + tree vectype = STMT_VINFO_VECTYPE (reduc_info); + tree old_vectype = TREE_TYPE (accumulator->reduc_input); + if (!useless_type_conversion_p (old_vectype, vectype)) + return false; + + /* Non-SLP reductions might apply an adjustment after the reduction + operation, in order to simplify the initialization of the accumulator. + If the epilogue loop carries on from where the main loop left off, + it should apply the same adjustment to the final reduction result. + + If the epilogue loop can also be entered directly (rather than via + the main loop), we need to be able to handle that case in the same way, + with the same adjustment. (In principle we could add a PHI node + to select the correct adjustment, but in practice that shouldn't be + necessary.) */ + tree main_adjustment + = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info); + if (loop_vinfo->main_loop_edge && main_adjustment) + { + gcc_assert (num_phis == 1); + tree initial_value = initial_values[0]; + /* Check that we can use INITIAL_VALUE as the adjustment and + initialize the accumulator with a neutral value instead. */ + if (!operand_equal_p (initial_value, main_adjustment)) + return false; + tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value), + code, initial_value); + } + STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment; + reduc_info->reduc_initial_values.truncate (0); + reduc_info->reduc_initial_values.splice (initial_values); + reduc_info->reused_accumulator = accumulator; + return true; +} + /* Function vect_create_epilog_for_reduction Create code at the loop-epilog to finalize the result of a reduction @@ -5005,15 +5026,18 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, imm_use_iterator imm_iter, phi_imm_iter; use_operand_p use_p, phi_use_p; gimple *use_stmt; - bool nested_in_vect_loop = false; - auto_vec<gimple *> new_phis; + auto_vec<tree> reduc_inputs; int j, i; - auto_vec<tree> scalar_results; + vec<tree> &scalar_results = reduc_info->reduc_scalar_results; unsigned int group_size = 1, k; auto_vec<gimple *> phis; - bool slp_reduc = false; + /* SLP reduction without reduction chain, e.g., + # a1 = phi <a2, a0> + # b1 = phi <b2, b0> + a2 = operation (a1) + b2 = operation (b1) */ + bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)); bool direct_slp_reduc; - tree new_phi_result; tree induction_index = NULL_TREE; if (slp_node) @@ -5023,38 +5047,39 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { outer_loop = loop; loop = loop->inner; - nested_in_vect_loop = true; - gcc_assert (!slp_node); + gcc_assert (!slp_node && double_reduc); } - gcc_assert (!nested_in_vect_loop || double_reduc); vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info); gcc_assert (vectype); mode = TYPE_MODE (vectype); - tree initial_def = NULL; tree induc_val = NULL_TREE; tree adjustment_def = NULL; if (slp_node) ; else { - /* Get at the scalar def before the loop, that defines the initial value - of the reduction variable. */ - initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, - loop_preheader_edge (loop)); /* Optimize: for induction condition reduction, if we can't use zero for induc_val, use initial_def. */ if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); else if (double_reduc) ; - else if (nested_in_vect_loop) - ; else adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info); } + stmt_vec_info single_live_out_stmt[] = { stmt_info }; + array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt; + if (slp_reduc) + /* All statements produce live-out values. */ + live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node); + else if (slp_node) + /* The last statement in the reduction chain produces the live-out + value. */ + single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]; + unsigned vec_num; int ncopies; if (slp_node) @@ -5205,31 +5230,28 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, if (double_reduc) loop = outer_loop; exit_bb = single_exit (loop)->dest; - new_phis.create (slp_node ? vec_num : ncopies); + exit_gsi = gsi_after_labels (exit_bb); + reduc_inputs.create (slp_node ? vec_num : ncopies); for (unsigned i = 0; i < vec_num; i++) { + gimple_seq stmts = NULL; if (slp_node) def = vect_get_slp_vect_def (slp_node, i); else def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]); for (j = 0; j < ncopies; j++) - { + { tree new_def = copy_ssa_name (def); - phi = create_phi_node (new_def, exit_bb); - if (j == 0) - new_phis.quick_push (phi); - else - { - def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]); - new_phis.quick_push (phi); - } - - SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); - } + phi = create_phi_node (new_def, exit_bb); + if (j) + def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]); + SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); + new_def = gimple_convert (&stmts, vectype, new_def); + reduc_inputs.quick_push (new_def); + } + gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); } - exit_gsi = gsi_after_labels (exit_bb); - /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 (i.e. when reduc_fn is not available) and in the final adjustment code (if needed). Also get the original scalar reduction variable as @@ -5253,13 +5275,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); bitsize = TYPE_SIZE (scalar_type); - /* SLP reduction without reduction chain, e.g., - # a1 = phi <a2, a0> - # b1 = phi <b2, b0> - a2 = operation (a1) - b2 = operation (b1) */ - slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)); - /* True if we should implement SLP_REDUC using native reduction operations instead of scalar operations. */ direct_slp_reduc = (reduc_fn != IFN_LAST @@ -5271,52 +5286,60 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, a2 = operation (a1) a3 = operation (a2), - we may end up with more than one vector result. Here we reduce them to - one vector. */ - if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc) + we may end up with more than one vector result. Here we reduce them + to one vector. + + The same is true if we couldn't use a single defuse cycle. */ + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) + || direct_slp_reduc + || ncopies > 1) { gimple_seq stmts = NULL; - tree first_vect = PHI_RESULT (new_phis[0]); - first_vect = gimple_convert (&stmts, vectype, first_vect); - for (k = 1; k < new_phis.length (); k++) - { - gimple *next_phi = new_phis[k]; - tree second_vect = PHI_RESULT (next_phi); - second_vect = gimple_convert (&stmts, vectype, second_vect); - first_vect = gimple_build (&stmts, code, vectype, - first_vect, second_vect); - } + tree single_input = reduc_inputs[0]; + for (k = 1; k < reduc_inputs.length (); k++) + single_input = gimple_build (&stmts, code, vectype, + single_input, reduc_inputs[k]); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - new_phi_result = first_vect; - new_phis.truncate (0); - new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); + reduc_inputs.truncate (0); + reduc_inputs.safe_push (single_input); } - /* Likewise if we couldn't use a single defuse cycle. */ - else if (ncopies > 1) + + tree orig_reduc_input = reduc_inputs[0]; + + /* If this loop is an epilogue loop that can be skipped after the + main loop, we can only share a reduction operation between the + main loop and the epilogue if we put it at the target of the + skip edge. + + We can still reuse accumulators if this check fails. Doing so has + the minor(?) benefit of making the epilogue loop's scalar result + independent of the main loop's scalar result. */ + bool unify_with_main_loop_p = false; + if (reduc_info->reused_accumulator + && loop_vinfo->skip_this_loop_edge + && single_succ_p (exit_bb) + && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest) { - gimple_seq stmts = NULL; - tree first_vect = PHI_RESULT (new_phis[0]); - first_vect = gimple_convert (&stmts, vectype, first_vect); - for (int k = 1; k < ncopies; ++k) - { - tree second_vect = PHI_RESULT (new_phis[k]); - second_vect = gimple_convert (&stmts, vectype, second_vect); - first_vect = gimple_build (&stmts, code, vectype, - first_vect, second_vect); - } - gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - new_phi_result = first_vect; - new_phis.truncate (0); - new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); + unify_with_main_loop_p = true; + + basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest; + reduc_inputs[0] = make_ssa_name (vectype); + gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block); + add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb), + UNKNOWN_LOCATION); + add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input, + loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION); + exit_gsi = gsi_after_labels (reduc_block); } - else - new_phi_result = PHI_RESULT (new_phis[0]); + + /* Shouldn't be used beyond this point. */ + exit_bb = nullptr; if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION && reduc_fn != IFN_LAST) { - /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing + /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing various data values where the condition matched and another vector (INDUCTION_INDEX) containing all the indexes of those matches. We need to extract the last matching index (which will be the index with @@ -5346,10 +5369,6 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Vector of {0, 0, 0,...}. */ tree zero_vec = build_zero_cst (vectype); - gimple_seq stmts = NULL; - new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); - gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - /* Find maximum value from the vector of found indexes. */ tree max_index = make_ssa_name (index_scalar_type); gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, @@ -5367,7 +5386,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes with the vector (INDUCTION_INDEX) of found indexes, choosing values - from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC) + from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC) otherwise. Only one value should match, resulting in a vector (VEC_COND) with one data value and the rest zeros. In the case where the loop never made any matches, every index will @@ -5386,7 +5405,8 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, zero. */ tree vec_cond = make_ssa_name (vectype); gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR, - vec_compare, new_phi_result, + vec_compare, + reduc_inputs[0], zero_vec); gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT); @@ -5416,7 +5436,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Convert the reduced value back to the result type and set as the result. */ - stmts = NULL; + gimple_seq stmts = NULL; new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, data_reduc); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); @@ -5434,7 +5454,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, val = data_reduc[i], idx_val = induction_index[i]; return val; */ - tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result)); + tree data_eltype = TREE_TYPE (vectype); tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index)); unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype)); poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index)); @@ -5458,7 +5478,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF, build3 (BIT_FIELD_REF, data_eltype, - new_phi_result, + reduc_inputs[0], bitsize_int (el_size), bitsize_int (off))); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); @@ -5510,10 +5530,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, "Reduce using direct vector reduction.\n"); gimple_seq stmts = NULL; - new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); - vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result)); + vec_elem_type = TREE_TYPE (vectype); new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn), - vec_elem_type, new_phi_result); + vec_elem_type, reduc_inputs[0]); new_temp = gimple_convert (&stmts, scalar_type, new_temp); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); @@ -5526,6 +5545,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, the same as initial_def already. */ tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, induc_val); + tree initial_def = reduc_info->reduc_initial_values[0]; tmp = make_ssa_name (new_scalar_dest); epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, @@ -5543,12 +5563,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, neutral value. We can then do a normal reduction on each vector. */ /* Enforced by vectorizable_reduction. */ - gcc_assert (new_phis.length () == 1); + gcc_assert (reduc_inputs.length () == 1); gcc_assert (pow2p_hwi (group_size)); - slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis; - vec<stmt_vec_info> orig_phis - = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node); gimple_seq seq = NULL; /* Build a vector {0, 1, 2, ...}, with the same number of elements @@ -5571,10 +5588,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, tree neutral_op = NULL_TREE; if (slp_node) { - stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info); - neutral_op - = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, - vectype, code, first != NULL); + tree initial_value = NULL_TREE; + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) + initial_value = reduc_info->reduc_initial_values[0]; + neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code, + initial_value); } if (neutral_op) vector_identity = gimple_build_vector_from_val (&seq, vectype, @@ -5586,9 +5604,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, for MIN and MAX reduction, for example. */ if (!neutral_op) { - tree scalar_value - = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt, - loop_preheader_edge (loop)); + tree scalar_value = reduc_info->reduc_initial_values[i]; scalar_value = gimple_convert (&seq, TREE_TYPE (vectype), scalar_value); vector_identity = gimple_build_vector_from_val (&seq, vectype, @@ -5599,7 +5615,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, sel[j] = (index[j] == i); - which selects the elements of NEW_PHI_RESULT that should + which selects the elements of REDUC_INPUTS[0] that should be included in the result. */ tree compare_val = build_int_cst (index_elt_type, i); compare_val = build_vector_from_val (index_type, compare_val); @@ -5608,11 +5624,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* Calculate the equivalent of: - vec = seq ? new_phi_result : vector_identity; + vec = seq ? reduc_inputs[0] : vector_identity; VEC is now suitable for a full vector reduction. */ tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype, - sel, new_phi_result, vector_identity); + sel, reduc_inputs[0], vector_identity); /* Do the reduction and convert it to the appropriate type. */ tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn), @@ -5627,7 +5643,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, bool reduce_with_shift; tree vec_temp; - gcc_assert (slp_reduc || new_phis.length () == 1); + gcc_assert (slp_reduc || reduc_inputs.length () == 1); /* See if the target wants to do the final (shift) reduction in a vector mode of smaller size and first reduce upper/lower @@ -5637,7 +5653,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); unsigned nunits1 = nunits; if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode - && new_phis.length () == 1) + && reduc_inputs.length () == 1) { nunits1 = GET_MODE_NUNITS (mode1).to_constant (); /* For SLP reductions we have to make sure lanes match up, but @@ -5669,7 +5685,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, /* First reduce the vector to the desired vector size we should do shift reduction on by combining upper and lower halves. */ - new_temp = new_phi_result; + new_temp = reduc_inputs[0]; while (nunits > nunits1) { nunits /= 2; @@ -5748,7 +5764,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, new_temp = make_ssa_name (vectype1); epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); - new_phis[0] = epilog_stmt; + reduc_inputs[0] = new_temp; } if (reduce_with_shift && !slp_reduc) @@ -5829,13 +5845,9 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, int element_bitsize = tree_to_uhwi (bitsize); tree compute_type = TREE_TYPE (vectype); gimple_seq stmts = NULL; - FOR_EACH_VEC_ELT (new_phis, i, new_phi) + FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp) { int bit_offset; - if (gimple_code (new_phi) == GIMPLE_PHI) - vec_temp = PHI_RESULT (new_phi); - else - vec_temp = gimple_assign_lhs (new_phi); new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type, vec_temp, bitsize, bitsize_zero_node); @@ -5882,6 +5894,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, first_res, res); scalar_results[j % group_size] = new_res; } + scalar_results.truncate (group_size); for (k = 0; k < group_size; k++) scalar_results[k] = gimple_convert (&stmts, scalar_type, scalar_results[k]); @@ -5905,6 +5918,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, the same as initial_def already. */ tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, induc_val); + tree initial_def = reduc_info->reduc_initial_values[0]; tree tmp = make_ssa_name (new_scalar_dest); epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, @@ -5923,13 +5937,12 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { gcc_assert (!slp_reduc); gimple_seq stmts = NULL; - if (nested_in_vect_loop) + if (double_reduc) { - new_phi = new_phis[0]; gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def))); adjustment_def = gimple_convert (&stmts, vectype, adjustment_def); new_temp = gimple_build (&stmts, code, vectype, - PHI_RESULT (new_phi), adjustment_def); + reduc_inputs[0], adjustment_def); } else { @@ -5942,21 +5955,17 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, epilog_stmt = gimple_seq_last_stmt (stmts); gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); - if (nested_in_vect_loop) - { - if (!double_reduc) - scalar_results.quick_push (new_temp); - else - scalar_results[0] = new_temp; - } - else - scalar_results[0] = new_temp; - - new_phis[0] = epilog_stmt; + scalar_results[0] = new_temp; } + /* Record this operation if it could be reused by the epilogue loop. */ + if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION + && !double_reduc) + loop_vinfo->reusable_accumulators.put (scalar_results[0], + { orig_reduc_input, reduc_info }); + if (double_reduc) - loop = loop->inner; + loop = outer_loop; /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit phis with new adjusted scalar results, i.e., replace use <s_out0> @@ -5983,47 +5992,11 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, use <s_out4> use <s_out4> */ - - /* In SLP reduction chain we reduce vector results into one vector if - necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the - LHS of the last stmt in the reduction chain, since we are looking for - the loop exit phi node. */ - if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) + gcc_assert (live_out_stmts.size () == scalar_results.length ()); + for (k = 0; k < live_out_stmts.size (); k++) { - stmt_vec_info dest_stmt_info - = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]); - scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt); - group_size = 1; - } - - /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in - case that REDUC_GROUP_SIZE is greater than vectorization factor). - Therefore, we need to match SCALAR_RESULTS with corresponding statements. - The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results - correspond to the first vector stmt, etc. - (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */ - if (group_size > new_phis.length ()) - gcc_assert (!(group_size % new_phis.length ())); - - for (k = 0; k < group_size; k++) - { - if (slp_reduc) - { - stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k]; - - orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info); - /* SLP statements can't participate in patterns. */ - gcc_assert (!orig_stmt_info); - scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); - } - - if (nested_in_vect_loop) - { - if (double_reduc) - loop = outer_loop; - else - gcc_unreachable (); - } + stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]); + scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); phis.create (3); /* Find the loop-closed-use at the loop exit of the original scalar @@ -6058,6 +6031,17 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo, { /* Replace the uses: */ orig_name = PHI_RESULT (exit_phi); + + /* Look for a single use at the target of the skip edge. */ + if (unify_with_main_loop_p) + { + use_operand_p use_p; + gimple *user; + if (!single_imm_use (orig_name, &use_p, &user)) + gcc_unreachable (); + orig_name = gimple_get_lhs (user); + } + scalar_result = scalar_results[k]; FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) { @@ -6830,10 +6814,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, else if (cond_reduc_dt == vect_constant_def) { enum vect_def_type cond_initial_dt; - tree cond_initial_val - = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop)); - - gcc_assert (cond_reduc_val != NULL_TREE); + tree cond_initial_val = vect_phi_initial_value (reduc_def_phi); vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); if (cond_initial_dt == vect_constant_def && types_compatible_p (TREE_TYPE (cond_initial_val), @@ -7026,9 +7007,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo, /* For SLP reductions, see if there is a neutral value we can use. */ tree neutral_op = NULL_TREE; if (slp_node) - neutral_op = neutral_op_for_slp_reduction - (slp_node_instance->reduc_phis, vectype_out, orig_code, - REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL); + { + tree initial_value = NULL_TREE; + if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL) + initial_value = vect_phi_initial_value (reduc_def_phi); + neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out), + orig_code, initial_value); + } if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) { @@ -7578,7 +7563,7 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, vectype_out); /* Get the loop-entry arguments. */ - tree vec_initial_def; + tree vec_initial_def = NULL_TREE; auto_vec<tree> vec_initial_defs; if (slp_node) { @@ -7592,22 +7577,40 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, else { gcc_assert (slp_node == slp_node_instance->reduc_phis); - stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info); - tree neutral_op - = neutral_op_for_slp_reduction (slp_node, vectype_out, - STMT_VINFO_REDUC_CODE (reduc_info), - first != NULL); - get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis, - &vec_initial_defs, vec_num, - first != NULL, neutral_op); + vec<tree> &initial_values = reduc_info->reduc_initial_values; + vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node); + + unsigned int num_phis = stmts.length (); + if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info)) + num_phis = 1; + initial_values.reserve (num_phis); + for (unsigned int i = 0; i < num_phis; ++i) + { + gphi *this_phi = as_a<gphi *> (stmts[i]->stmt); + initial_values.quick_push (vect_phi_initial_value (this_phi)); + } + if (vec_num == 1) + vect_find_reusable_accumulator (loop_vinfo, reduc_info); + if (!initial_values.is_empty ()) + { + tree initial_value + = (num_phis == 1 ? initial_values[0] : NULL_TREE); + tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + tree neutral_op + = neutral_op_for_reduction (TREE_TYPE (vectype_out), + code, initial_value); + get_initial_defs_for_reduction (loop_vinfo, reduc_info, + &vec_initial_defs, vec_num, + stmts.length (), neutral_op); + } } } else { /* Get at the scalar def before the loop, that defines the initial value of the reduction variable. */ - tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi, - loop_preheader_edge (loop)); + tree initial_def = vect_phi_initial_value (phi); + reduc_info->reduc_initial_values.safe_push (initial_def); /* Optimize: if initial_def is for REDUC_MAX smaller than the base and we can't use zero for induc_val, use initial_def. Similarly for REDUC_MIN and initial_def larger than the base. */ @@ -7627,9 +7630,6 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE; } vec_initial_def = build_vector_from_val (vectype_out, induc_val); - vec_initial_defs.create (ncopies); - for (i = 0; i < ncopies; ++i) - vec_initial_defs.quick_push (vec_initial_def); } else if (nested_cycle) { @@ -7639,23 +7639,59 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, ncopies, initial_def, &vec_initial_defs); } + else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION + || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) + /* Fill the initial vector with the initial scalar value. */ + vec_initial_def + = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, + initial_def, initial_def); else { - tree adjustment_def = NULL_TREE; - tree *adjustment_defp = &adjustment_def; - enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); - if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) - adjustment_defp = NULL; - vec_initial_def - = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code, - initial_def, adjustment_defp); - STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def; - vec_initial_defs.create (ncopies); - for (i = 0; i < ncopies; ++i) - vec_initial_defs.quick_push (vec_initial_def); + if (ncopies == 1) + vect_find_reusable_accumulator (loop_vinfo, reduc_info); + if (!reduc_info->reduc_initial_values.is_empty ()) + { + initial_def = reduc_info->reduc_initial_values[0]; + enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); + tree neutral_op + = neutral_op_for_reduction (TREE_TYPE (initial_def), + code, initial_def); + gcc_assert (neutral_op); + /* Try to simplify the vector initialization by applying an + adjustment after the reduction has been performed. */ + if (!reduc_info->reused_accumulator + && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def + && !operand_equal_p (neutral_op, initial_def)) + { + STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) + = initial_def; + initial_def = neutral_op; + } + vec_initial_def + = get_initial_def_for_reduction (loop_vinfo, reduc_info, + initial_def, neutral_op); + } } } + if (vec_initial_def) + { + vec_initial_defs.create (ncopies); + for (i = 0; i < ncopies; ++i) + vec_initial_defs.quick_push (vec_initial_def); + } + + if (auto *accumulator = reduc_info->reused_accumulator) + { + if (loop_vinfo->main_loop_edge) + vec_initial_defs[0] + = vect_get_main_loop_result (loop_vinfo, accumulator->reduc_input, + vec_initial_defs[0]); + else + vec_initial_defs.safe_push (accumulator->reduc_input); + gcc_assert (vec_initial_defs.length () == 1); + } + /* Generate the reduction PHIs upfront. */ for (i = 0; i < vec_num; i++) { @@ -8253,8 +8289,7 @@ vectorizable_induction (loop_vec_info loop_vinfo, return true; } - init_expr = PHI_ARG_DEF_FROM_EDGE (phi, - loop_preheader_edge (iv_loop)); + init_expr = vect_phi_initial_value (phi); gimple_seq stmts = NULL; if (!nested_in_vect_loop) diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 9748043f3ee..f1035a83826 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -694,6 +694,8 @@ vec_info::new_stmt_vec_info (gimple *stmt) STMT_VINFO_SLP_VECT_ONLY (res) = false; STMT_VINFO_SLP_VECT_ONLY_PATTERN (res) = false; STMT_VINFO_VEC_STMTS (res) = vNULL; + res->reduc_initial_values = vNULL; + res->reduc_scalar_results = vNULL; if (is_a <loop_vec_info> (this) && gimple_code (stmt) == GIMPLE_PHI @@ -755,6 +757,8 @@ vec_info::free_stmt_vec_info (stmt_vec_info stmt_info) release_ssa_name (lhs); } + stmt_info->reduc_initial_values.release (); + stmt_info->reduc_scalar_results.release (); STMT_VINFO_SIMD_CLONE_INFO (stmt_info).release (); STMT_VINFO_VEC_STMTS (stmt_info).release (); free (stmt_info); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index fa28336d429..ed7a7738880 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -27,7 +27,7 @@ typedef class _stmt_vec_info *stmt_vec_info; #include "tree-hash-traits.h" #include "target.h" #include "internal-fn.h" - +#include "tree-ssa-operands.h" /* Used for naming of new temporaries. */ enum vect_var_kind { @@ -551,6 +551,18 @@ typedef auto_vec<rgroup_controls> vec_loop_lens; typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec; +/* Information about a reduction accumulator from the main loop that could + conceivably be reused as the input to a reduction in an epilogue loop. */ +struct vect_reusable_accumulator { + /* The final value of the accumulator, which forms the input to the + reduction operation. */ + tree reduc_input; + + /* The stmt_vec_info that describes the reduction (i.e. the one for + which is_reduc_info is true). */ + stmt_vec_info reduc_info; +}; + /*-----------------------------------------------------------------*/ /* Info on vectorized loops. */ /*-----------------------------------------------------------------*/ @@ -588,6 +600,23 @@ public: /* Unrolling factor */ poly_uint64 vectorization_factor; + /* If this loop is an epilogue loop whose main loop can be skipped, + MAIN_LOOP_EDGE is the edge from the main loop to this loop's + preheader. SKIP_MAIN_LOOP_EDGE is then the edge that skips the + main loop and goes straight to this loop's preheader. + + Both fields are null otherwise. */ + edge main_loop_edge; + edge skip_main_loop_edge; + + /* If this loop is an epilogue loop that might be skipped after executing + the main loop, this edge is the one that skips the epilogue. */ + edge skip_this_loop_edge; + + /* After vectorization, maps live-out SSA names to information about + the reductions that generated them. */ + hash_map<tree, vect_reusable_accumulator> reusable_accumulators; + /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR if there is no particular limit. */ unsigned HOST_WIDE_INT max_vectorization_factor; @@ -1186,6 +1215,21 @@ public: /* The vector type for performing the actual reduction. */ tree reduc_vectype; + /* If IS_REDUC_INFO is true and if the reduction is operating on N + elements in parallel, this vector gives the initial values of these + N elements. */ + vec<tree> reduc_initial_values; + + /* If IS_REDUC_INFO is true and if the reduction is operating on N + elements in parallel, this vector gives the scalar result of each + reduction. */ + vec<tree> reduc_scalar_results; + + /* Only meaningful if IS_REDUC_INFO. If non-null, the reduction is + being performed by an epilogue loop and we have decided to reuse + this accumulator from the main loop. */ + vect_reusable_accumulator *reused_accumulator; + /* Whether we force a single cycle PHI during reduction vectorization. */ bool force_single_cycle; @@ -1369,6 +1413,19 @@ nested_in_vect_loop_p (class loop *loop, stmt_vec_info stmt_info) && (loop->inner == (gimple_bb (stmt_info->stmt))->loop_father)); } +/* PHI is either a scalar reduction phi or a scalar induction phi. + Return the initial value of the variable on entry to the containing + loop. */ + +static inline tree +vect_phi_initial_value (gphi *phi) +{ + basic_block bb = gimple_bb (phi); + edge pe = loop_preheader_edge (bb->loop_father); + gcc_assert (pe->dest == bb); + return PHI_ARG_DEF_FROM_EDGE (phi, pe); +} + /* Return true if STMT_INFO should produce a vector mask type rather than a normal nonmask type. */ @@ -1799,6 +1856,7 @@ class loop *vect_loop_versioning (loop_vec_info, gimple *); extern class loop *vect_do_peeling (loop_vec_info, tree, tree, tree *, tree *, tree *, int, bool, bool, tree *); +extern tree vect_get_main_loop_result (loop_vec_info, tree, tree = NULL_TREE); extern void vect_prepare_for_masked_peels (loop_vec_info); extern dump_user_location_t find_loop_location (class loop *); extern bool vect_can_advance_ivs_p (loop_vec_info);