https://gcc.gnu.org/g:e84e5d034124c6733d3b36d8623c56090d4d17f7
commit r15-3767-ge84e5d034124c6733d3b36d8623c56090d4d17f7
Author: Tamar Christina <tamar.christ...@arm.com>
Date:   Sun Sep 22 13:34:10 2024 +0100

    aarch64: Take into account when VF is higher than known scalar iters

    Consider low overhead loops like:

    void
    foo (char *restrict a, int *restrict b, int *restrict c, int n)
    {
      for (int i = 0; i < 9; i++)
        {
          int res = c[i];
          int t = b[i];
          if (a[i] != 0)
            res = t;
          c[i] = res;
        }
    }

    For such loops we use latency-only costing since the loop bound is
    known and small.

    The current costing however does not consider the case where
    niters < VF, so when comparing the scalar vs vector costs it does not
    take into account that the scalar code cannot perform VF iterations.
    This makes it overestimate the cost of the scalar loop, and we
    incorrectly vectorize.

    This patch takes the minimum of the VF and niters in such cases.
    Before the patch we generate:

      note:  Original vector body cost = 46
      note:  Vector loop iterates at most 1 times
      note:  Scalar issue estimate:
      note:    load operations = 2
      note:    store operations = 1
      note:    general operations = 1
      note:    reduction latency = 0
      note:    estimated min cycles per iteration = 1.000000
      note:    estimated cycles per vector iteration (for VF 32) = 32.000000
      note:  SVE issue estimate:
      note:    load operations = 5
      note:    store operations = 4
      note:    general operations = 11
      note:    predicate operations = 12
      note:    reduction latency = 0
      note:    estimated min cycles per iteration without predication = 5.500000
      note:    estimated min cycles per iteration for predication = 12.000000
      note:    estimated min cycles per iteration = 12.000000
      note:  Low iteration count, so using pure latency costs
      note:  Cost model analysis:

    vs after:

      note:  Original vector body cost = 46
      note:  Known loop bounds, capping VF to 9 for analysis
      note:  Vector loop iterates at most 1 times
      note:  Scalar issue estimate:
      note:    load operations = 2
      note:    store operations = 1
      note:    general operations = 1
      note:    reduction latency = 0
      note:    estimated min cycles per iteration = 1.000000
      note:    estimated cycles per vector iteration (for VF 9) = 9.000000
      note:  SVE issue estimate:
      note:    load operations = 5
      note:    store operations = 4
      note:    general operations = 11
      note:    predicate operations = 12
      note:    reduction latency = 0
      note:    estimated min cycles per iteration without predication = 5.500000
      note:    estimated min cycles per iteration for predication = 12.000000
      note:    estimated min cycles per iteration = 12.000000
      note:  Increasing body cost to 1472 because the scalar code could issue
             within the limit imposed by predicate operations
      note:  Low iteration count, so using pure latency costs
      note:  Cost model analysis:

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (adjust_body_cost): Cap VF for low
	iteration loops.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/asrdiv_4.c: Update bounds.
	* gcc.target/aarch64/sve/cond_asrd_2.c: Likewise.
	* gcc.target/aarch64/sve/cond_uxt_6.c: Likewise.
	* gcc.target/aarch64/sve/cond_uxt_7.c: Likewise.
	* gcc.target/aarch64/sve/cond_uxt_8.c: Likewise.
	* gcc.target/aarch64/sve/miniloop_1.c: Likewise.
	* gcc.target/aarch64/sve/spill_6.c: Likewise.
	* gcc.target/aarch64/sve/sve_iters_low_1.c: New test.
	* gcc.target/aarch64/sve/sve_iters_low_2.c: New test.
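To make the costing change concrete, here is a minimal standalone model of
the comparison (illustrative only, not GCC code; the cycle estimates and
trip count are copied from the dumps above):

    // Illustrative model of the latency-only cost comparison.
    // Constants are taken from the dump output in the commit message.
    #include <algorithm>
    #include <cstdio>

    int main ()
    {
      double scalar_cycles_per_iter = 1.0;  // scalar issue estimate
      double sve_cycles_per_iter = 12.0;    // SVE estimate with predication
      long estimated_vf = 32;               // scalar elements per vector iter
      long niters = 9;                      // known scalar trip count

      // Before: charge the scalar loop for a full VF's worth of iterations.
      double scalar_before = scalar_cycles_per_iter * estimated_vf;   // 32.0
      // After: the scalar loop can never run more than niters iterations.
      double scalar_after
        = scalar_cycles_per_iter * std::min (estimated_vf, niters);   // 9.0

      std::printf ("before: %.1f vs %.1f -> %s\n",
                   scalar_before, sve_cycles_per_iter,
                   scalar_before > sve_cycles_per_iter
                   ? "vectorize" : "keep scalar");
      std::printf ("after:  %.1f vs %.1f -> %s\n",
                   scalar_after, sve_cycles_per_iter,
                   scalar_after > sve_cycles_per_iter
                   ? "vectorize" : "keep scalar");
      return 0;
    }

With the uncapped VF of 32, the scalar loop appears to cost 32.0 cycles per
vector iteration, so the 12.0-cycle SVE body looks like a win; capped to the
real trip count of 9, the scalar loop wins and the loop is left unvectorized.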
Diff:
---
 gcc/config/aarch64/aarch64.cc                      | 13 +++++++++++++
 gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c    | 12 ++++++------
 gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c | 12 ++++++------
 gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c  |  8 ++++----
 gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c  |  8 ++++----
 gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c  |  8 ++++----
 gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c  |  2 +-
 gcc/testsuite/gcc.target/aarch64/sve/spill_6.c     |  8 ++++----
 .../gcc.target/aarch64/sve/sve_iters_low_1.c       | 17 +++++++++++++++++
 .../gcc.target/aarch64/sve/sve_iters_low_2.c       | 20 ++++++++++++++++++++
 10 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 92763d403c75..68913beaee20 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17565,6 +17565,19 @@ adjust_body_cost (loop_vec_info loop_vinfo,
     dump_printf_loc (MSG_NOTE, vect_location,
		      "Original vector body cost = %d\n", body_cost);
 
+  /* If we know we have a single partial vector iteration, cap the VF
+     to the number of scalar iterations for costing purposes.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      auto niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+      if (niters < estimated_vf && dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "Scalar loop iterates at most %wd times. Capping VF "
+			 " from %d to %wd\n", niters, estimated_vf, niters);
+
+      estimated_vf = MIN (estimated_vf, niters);
+    }
+
   fractional_cost scalar_cycles_per_iter
     = scalar_ops.min_cycles_per_iter () * estimated_vf;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
index 6684fe1c1244..10a96a894afd 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
@@ -15,12 +15,12 @@
   }
 
 #define TEST_ALL(T) \
-  T (int16_t, int8_t, 7) \
-  T (int32_t, int8_t, 3) \
-  T (int32_t, int16_t, 3) \
-  T (int64_t, int8_t, 5) \
-  T (int64_t, int16_t, 5) \
-  T (int64_t, int32_t, 5)
+  T (int16_t, int8_t, 70) \
+  T (int32_t, int8_t, 30) \
+  T (int32_t, int16_t, 30) \
+  T (int64_t, int8_t, 50) \
+  T (int64_t, int16_t, 50) \
+  T (int64_t, int32_t, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
index e4040ee3520c..db1721efbc7b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
@@ -14,12 +14,12 @@
   }
 
 #define TEST_ALL(T) \
-  T (int16_t, int8_t, 7) \
-  T (int32_t, int8_t, 3) \
-  T (int32_t, int16_t, 3) \
-  T (int64_t, int8_t, 5) \
-  T (int64_t, int16_t, 5) \
-  T (int64_t, int32_t, 5)
+  T (int16_t, int8_t, 70) \
+  T (int32_t, int8_t, 30) \
+  T (int32_t, int16_t, 30) \
+  T (int64_t, int8_t, 50) \
+  T (int64_t, int16_t, 50) \
+  T (int64_t, int32_t, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
index e47276a3a352..b8b3e862d0a1 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
@@ -14,11 +14,11 @@
   }
 
 #define TEST_ALL(T) \
-  T (int32_t, uint16_t, 0xff, 3) \
+  T (int32_t, uint16_t, 0xff, 30) \
   \
-  T (int64_t, uint16_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
index f49915c4ac14..2d02fb70f33f 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
@@ -14,11 +14,11 @@
   }
 
 #define TEST_ALL(T) \
-  T (int32_t, uint16_t, 0xff, 3) \
+  T (int32_t, uint16_t, 0xff, 30) \
   \
-  T (int64_t, uint16_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
index 42eb4b2661b3..8fe2455687b5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
@@ -14,11 +14,11 @@
   }
 
 #define TEST_ALL(T) \
-  T (int32_t, uint16_t, 0xff, 3) \
+  T (int32_t, uint16_t, 0xff, 30) \
   \
-  T (int64_t, uint16_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xff, 5) \
-  T (int64_t, uint32_t, 0xffff, 5)
+  T (int64_t, uint16_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xff, 50) \
+  T (int64_t, uint32_t, 0xffff, 50)
 
 TEST_ALL (DEF_LOOP)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
index 09eb4146816c..cd1fd2b8a078 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
@@ -6,7 +6,7 @@ void loop (int * __restrict__ a, int * __restrict__ b, int * __restrict__ c,
	    int * __restrict__ g, int * __restrict__ h)
 {
   int i = 0;
-  for (i = 0; i < 3; i++)
+  for (i = 0; i < 30; i++)
     {
       a[i] += i;
       b[i] += i;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
index ae9c338f5696..2ff969ced009 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
@@ -11,20 +11,20 @@ void consumer (void *);
   { \
     if (which) \
       { \
-	for (int i = 0; i < 7; ++i) \
+	for (int i = 0; i < 70; ++i) \
	  x1[i] += VAL; \
	consumer (x1); \
-	for (int i = 0; i < 7; ++i) \
+	for (int i = 0; i < 70; ++i) \
	  x2[i] -= VAL; \
	consumer (x2); \
       } \
     else \
       { \
-	for (int i = 0; i < 7; ++i) \
+	for (int i = 0; i < 70; ++i) \
	  x3[i] &= VAL; \
	consumer (x3); \
       } \
-    for (int i = 0; i < 7; ++i) \
+    for (int i = 0; i < 70; ++i) \
       x4[i] |= VAL; \
     consumer (x4); \
   }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
new file mode 100644
index 000000000000..952a4b1cd580
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n)
+{
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
new file mode 100644
index 000000000000..02d10de2a621
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n, int stride)
+{
+  if (stride <= 1)
+    return;
+
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i*stride];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
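
For reference, the logic of the new aarch64.cc hunk can be restated as a
standalone helper (a simplified sketch with hypothetical names and plain
long types, not the real loop_vec_info interface):

    // Simplified restatement of the capping rule in adjust_body_cost:
    // only a known trip count smaller than the estimated VF changes the
    // costing; unknown or large trip counts leave the estimate alone.
    #include <algorithm>

    long cap_vf_for_costing (bool niters_known_p, long niters,
                             long estimated_vf)
    {
      if (!niters_known_p)
        return estimated_vf;                  // unknown trip count: no cap
      return std::min (estimated_vf, niters); // e.g. VF 32, niters 9 -> 9
    }

Note that the pre-existing tests raise their trip counts by 10x (3/5/7 to
30/50/70), presumably so that niters stays above the estimated VF, leaving
their costing, and hence their expected vectorization, unchanged by the cap.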