We vectorise:

int
f (int *a, int n)
{
  int count = 0;
  for (int i = 0; i < n; ++i)
    if (a[i] < 255)
      count += 1;
  return count;
}
using an add reduction of a VEC_COND_EXPR <COND, {1, 1, ...}, {0, 0, ...}>.
This leads to the main loop having an AND with a loop invariant
{1, 1, ...}.  E.g. on aarch64:

        movi    v2.4s, 0x1
.L4:
        lsl     x5, x4, 4
        add     x4, x4, 1
        cmp     w2, w4
        ldr     q1, [x0, x5]
        cmge    v1.4s, v3.4s, v1.4s
        and     v1.16b, v2.16b, v1.16b
        add     v0.4s, v0.4s, v1.4s
        bhi     .L4

This patch converts an ADD of that VEC_COND_EXPR into a SUB of COND:

.L4:
        lsl     x5, x4, 4
        add     x4, x4, 1
        cmp     w2, w4
        ldr     q1, [x0, x5]
        cmge    v1.4s, v2.4s, v1.4s
        sub     v0.4s, v0.4s, v1.4s
        bhi     .L4
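(For illustration only, not part of the patch, and with invented function
names: the scalar picture below shows why the two forms compute the same
value.  A "true" lane of a vector comparison is all ones, i.e. -1, so
adding COND ? 1 : 0 per element gives the same total as subtracting the
raw comparison result.)

/* Illustration only: count_by_and and count_by_sub are hypothetical
   names.  Both loops compute the same value, because -(a[i] < x) is
   either 0 or -1.  */
int
count_by_and (int *a, int n, int x)
{
  int count = 0;
  for (int i = 0; i < n; ++i)
    count += (a[i] < x ? 1 : 0);  /* accumulate a 0/1 value */
  return count;
}

int
count_by_sub (int *a, int n, int x)
{
  int count = 0;
  for (int i = 0; i < n; ++i)
    count -= -(a[i] < x);         /* subtract a 0/-1 value instead */
  return count;
}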
At the moment the simplification is done during forwprop4, after the
last dce pass, and so the VEC_COND_EXPR survives until expand.  Richi
says this is a known problem.  Of course, the expression gets deleted
by rtl dce, but it means that a scan-tree-dump of the *.optimized
output can't easily tell that the optimisation has triggered.  I've
therefore added a scan-assembler test instead.

Bootstrapped & regression-tested on x86_64-linux-gnu.  Also tested on
aarch64-elf.  OK to install?

Thanks,
Richard


gcc/
	* match.pd: Add patterns for vec_conds between -1 and 0, and
	between 1 and 0.

gcc/testsuite/
	* gcc.target/aarch64/vect-add-sub-cond.c: New test.

Index: gcc/match.pd
===================================================================
--- gcc/match.pd	2015-06-23 11:42:23.644645975 +0100
+++ gcc/match.pd	2015-06-23 11:42:23.760644655 +0100
@@ -973,6 +973,36 @@ along with GCC; see the file COPYING3.
   (cnd @0 @2 @1)))
 
+/* Vector comparisons are defined to produce all-one or all-zero results.  */
+(simplify
+ (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (convert @0)))
+
+/* We could instead convert all instances of the vec_cond to negate,
+   but that isn't necessarily a win on its own.  */
+(simplify
+ (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (minus @3 (convert @0))))
+
+(simplify
+ (plus:c @3 (view_convert_expr
+	     (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (minus @3 (convert @0))))
+
+(simplify
+ (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (plus @3 (convert @0))))
+
+(simplify
+ (minus @3 (view_convert_expr
+	    (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (plus @3 (convert @0))))
+
 /* Simplifications of comparisons.  */
 
 /* We can simplify a logical negation of a comparison to the
Index: gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c
===================================================================
--- /dev/null	2015-06-02 17:27:28.541944012 +0100
+++ gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c	2015-06-23 12:06:27.120203685 +0100
@@ -0,0 +1,94 @@
+/* Make sure that vector comparison results are not unnecessarily ANDed
+   with vectors of 1.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define COUNT1(X) if (X) count += 1
+#define COUNT2(X) if (X) count -= 1
+#define COUNT3(X) count += (X)
+#define COUNT4(X) count -= (X)
+
+#define COND1(X) (X)
+#define COND2(X) ((X) ? 1 : 0)
+#define COND3(X) ((X) ? -1 : 0)
+#define COND4(X) ((X) ? 0 : 1)
+#define COND5(X) ((X) ? 0 : -1)
+
+#define TEST_LT(X, Y) ((X) < (Y))
+#define TEST_LE(X, Y) ((X) <= (Y))
+#define TEST_GT(X, Y) ((X) > (Y))
+#define TEST_GE(X, Y) ((X) >= (Y))
+#define TEST_EQ(X, Y) ((X) == (Y))
+#define TEST_NE(X, Y) ((X) != (Y))
+
+#define COUNT_LOOP(ID, TYPE, CMP_ARRAY, TEST, COUNT) \
+  TYPE \
+  reduc_##ID (__typeof__ (CMP_ARRAY[0]) x) \
+  { \
+    TYPE count = 0; \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      COUNT (TEST (CMP_ARRAY[i], x)); \
+    return count; \
+  }
+
+#define COND_LOOP(ID, ARRAY, CMP_ARRAY, TEST, COND) \
+  void \
+  plus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] += COND (TEST (CMP_ARRAY[i], x)); \
+  } \
+  void \
+  plusc_##ID (void) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 10)); \
+  } \
+  void \
+  minus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] -= COND (TEST (CMP_ARRAY[i], x)); \
+  } \
+  void \
+  minusc_##ID (void) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] -= COND (TEST (CMP_ARRAY[i], 1)); \
+  }
+
+#define ALL_LOOPS(ID, ARRAY, CMP_ARRAY, TEST) \
+  typedef __typeof__ (ARRAY[0]) ID##_type; \
+  COUNT_LOOP (ID##_1, ID##_type, CMP_ARRAY, TEST, COUNT1) \
+  COUNT_LOOP (ID##_2, ID##_type, CMP_ARRAY, TEST, COUNT2) \
+  COUNT_LOOP (ID##_3, ID##_type, CMP_ARRAY, TEST, COUNT3) \
+  COUNT_LOOP (ID##_4, ID##_type, CMP_ARRAY, TEST, COUNT4) \
+  COND_LOOP (ID##_1, ARRAY, CMP_ARRAY, TEST, COND1) \
+  COND_LOOP (ID##_2, ARRAY, CMP_ARRAY, TEST, COND2) \
+  COND_LOOP (ID##_3, ARRAY, CMP_ARRAY, TEST, COND3) \
+  COND_LOOP (ID##_4, ARRAY, CMP_ARRAY, TEST, COND4) \
+  COND_LOOP (ID##_5, ARRAY, CMP_ARRAY, TEST, COND5)
+
+signed int asi[1024] __attribute__ ((aligned (16)));
+unsigned int aui[1024] __attribute__ ((aligned (16)));
+signed long long asl[1024] __attribute__ ((aligned (16)));
+unsigned long long aul[1024] __attribute__ ((aligned (16)));
+float af[1024] __attribute__ ((aligned (16)));
+double ad[1024] __attribute__ ((aligned (16)));
+
+ALL_LOOPS (si_si, aui, asi, TEST_LT)
+ALL_LOOPS (ui_si, aui, asi, TEST_LE)
+ALL_LOOPS (si_ui, aui, asi, TEST_GT)
+ALL_LOOPS (ui_ui, aui, asi, TEST_GE)
+ALL_LOOPS (sl_sl, asl, asl, TEST_NE)
+ALL_LOOPS (ul_ul, aul, aul, TEST_NE)
+ALL_LOOPS (si_f, asi, af, TEST_LE)
+ALL_LOOPS (ui_f, aui, af, TEST_GT)
+ALL_LOOPS (sl_d, asl, ad, TEST_GE)
+ALL_LOOPS (ul_d, aul, ad, TEST_GT)
+
+/* { dg-final { scan-assembler-not "\tand\t" } } */
+/* { dg-final { scan-assembler-not "\tld\[^\t\]*\t\[wx\]" } } */
+/* { dg-final { scan-assembler-not "\tst\[^\t\]*\t\[wx\]" } } */
+/* { dg-final { scan-assembler "\tldr\tq" } } */
+/* { dg-final { scan-assembler "\tstr\tq" } } */
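(As a footnote, not part of the patch: the "all-one or all-zero" behaviour
that the first new pattern relies on can also be seen directly with the
generic vector extensions.  The typedef and function below are made up
purely for illustration.)

typedef int v4si __attribute__ ((vector_size (16)));

/* Each lane of the result is -1 (all bits set) where the comparison is
   true and 0 where it is false, which is why a vec_cond selecting
   between -1 and 0 on that result is just the result itself.  */
v4si
cmp_mask (v4si a, v4si b)
{
  return a < b;
}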