Hi, As Ramana hinted here: https://gcc.gnu.org/ml/gcc-patches/2014-11/msg00607.html
There are two issues with the way we've defined our BSL pattern. We pun types around in a way that is scary and quite likely unsafe, and we haven't canonicalized the pattern so combine is unlikely to pick it up. This patch fixes both of these issues and adds testcases to ensure we are picking up the combine opportunity. I've bootstrapped and tested this on aarch64-none-linux-gnu and cross-tested it for aarch64-none-elf. OK? Cheers, James --- gcc/ 2014-11-11 James Greenhalgh <james.greenha...@arm.com> * config/aarch64/aarch64-simd.md (aarch64_simd_bsl<mode>_internal): Remove float cases, canonicalize. (aarch64_simd_bsl<mode>): Add gen_lowpart expressions where we are punning between float vectors and integer vectors. gcc/testsuite/ 2014-11-11 James Greenhalgh <james.greenha...@arm.com> * gcc.target/aarch64/vbslq_f64_1.c: New. * gcc.target/aarch64/vbslq_f64_2.c: Likewise. * gcc.target/aarch64/vbslq_u64_1.c: Likewise. * gcc.target/aarch64/vbslq_u64_2.c: Likewise.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index ef196e4b6fb39c0d2fd9ebfee76abab8369b1e92..f7012ecab07c1b38836e949c2f4e5bd0c7939b5c 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1924,15 +1924,15 @@ (define_insn "aarch64_reduc_<maxmin_uns> ;; bif op0, op1, mask (define_insn "aarch64_simd_bsl<mode>_internal" - [(set (match_operand:VALLDIF 0 "register_operand" "=w,w,w") - (ior:VALLDIF - (and:VALLDIF - (match_operand:<V_cmp_result> 1 "register_operand" " 0,w,w") - (match_operand:VALLDIF 2 "register_operand" " w,w,0")) - (and:VALLDIF + [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w") + (ior:VSDQ_I_DI + (and:VSDQ_I_DI (not:<V_cmp_result> - (match_dup:<V_cmp_result> 1)) - (match_operand:VALLDIF 3 "register_operand" " w,0,w")) + (match_operand:<V_cmp_result> 1 "register_operand" " 0,w,w")) + (match_operand:VSDQ_I_DI 3 "register_operand" " w,0,w")) + (and:VSDQ_I_DI + (match_dup:<V_cmp_result> 1) + (match_operand:VSDQ_I_DI 2 "register_operand" " w,w,0")) ))] "TARGET_SIMD" "@ @@ -1950,9 +1950,21 @@ (define_expand "aarch64_simd_bsl<mode>" "TARGET_SIMD" { /* We can't alias operands together if they have different modes. 
*/ + rtx tmp = operands[0]; + if (FLOAT_MODE_P (<MODE>mode)) + { + operands[2] = gen_lowpart (<V_cmp_result>mode, operands[2]); + operands[3] = gen_lowpart (<V_cmp_result>mode, operands[3]); + tmp = gen_reg_rtx (<V_cmp_result>mode); + } operands[1] = gen_lowpart (<V_cmp_result>mode, operands[1]); - emit_insn (gen_aarch64_simd_bsl<mode>_internal (operands[0], operands[1], - operands[2], operands[3])); + emit_insn (gen_aarch64_simd_bsl<v_cmp_result>_internal (tmp, + operands[1], + operands[2], + operands[3])); + if (tmp != operands[0]) + emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp)); + DONE; }) diff --git a/gcc/testsuite/gcc.target/aarch64/vbslq_f64_1.c b/gcc/testsuite/gcc.target/aarch64/vbslq_f64_1.c new file mode 100644 index 0000000..7b0e8f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vbslq_f64_1.c @@ -0,0 +1,20 @@ +/* Test vbslq_f32 can be folded. */ +/* { dg-do assemble } */ +/* { dg-options "--save-temps -O3" } */ + +#include <arm_neon.h> + +/* Folds to ret. */ + +float32x4_t +fold_me (float32x4_t a, float32x4_t b) +{ + uint32x4_t mask = {-1, -1, -1, -1}; + return vbslq_f32 (mask, a, b); +} + +/* { dg-final { scan-assembler-not "bsl\\tv" } } */ +/* { dg-final { scan-assembler-not "bit\\tv" } } */ +/* { dg-final { scan-assembler-not "bif\\tv" } } */ + +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vbslq_f64_2.c b/gcc/testsuite/gcc.target/aarch64/vbslq_f64_2.c new file mode 100644 index 0000000..1dca90d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vbslq_f64_2.c @@ -0,0 +1,23 @@ +/* Test vbslq_f32 can be folded. */ +/* { dg-do assemble } */ +/* { dg-options "--save-temps -O3" } */ + +#include <arm_neon.h> + +/* Should fold out one half of the BSL, leaving just a BIC. 
*/ + +float32x4_t +half_fold_me (uint32x4_t mask) +{ + float32x4_t a = {0.0, 0.0, 0.0, 0.0}; + float32x4_t b = {2.0, 4.0, 8.0, 16.0}; + return vbslq_f32 (mask, a, b); + +} + +/* { dg-final { scan-assembler-not "bsl\\tv" } } */ +/* { dg-final { scan-assembler-not "bit\\tv" } } */ +/* { dg-final { scan-assembler-not "bif\\tv" } } */ +/* { dg-final { scan-assembler "bic\\tv" } } */ + +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vbslq_u64_1.c b/gcc/testsuite/gcc.target/aarch64/vbslq_u64_1.c new file mode 100644 index 0000000..9c61d1a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vbslq_u64_1.c @@ -0,0 +1,16 @@ +/* Test if a BSL-like instruction can be generated from a C idiom. */ +/* { dg-do assemble } */ +/* { dg-options "--save-temps -O3" } */ + +#include <arm_neon.h> + +/* Folds to BIF. */ + +uint32x4_t +vbslq_dummy_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t mask) +{ + return (mask & a) | (~mask & b); +} + +/* { dg-final { scan-assembler-times "bif\\tv" 1 } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vbslq_u64_2.c b/gcc/testsuite/gcc.target/aarch64/vbslq_u64_2.c new file mode 100644 index 0000000..4540351 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vbslq_u64_2.c @@ -0,0 +1,21 @@ +/* Test vbslq_s32 can be folded. */ +/* { dg-do assemble } */ +/* { dg-options "--save-temps -O3" } */ +#include <arm_neon.h> + +/* Folds to BIC. */ + +int32x4_t +half_fold_int (uint32x4_t mask) +{ + int32x4_t a = {0, 0, 0, 0}; + int32x4_t b = {2, 4, 8, 16}; + return vbslq_s32 (mask, a, b); +} + +/* { dg-final { scan-assembler-not "bsl\\tv" } } */ +/* { dg-final { scan-assembler-not "bit\\tv" } } */ +/* { dg-final { scan-assembler-not "bif\\tv" } } */ +/* { dg-final { scan-assembler "bic\\tv" } } */ + +/* { dg-final { cleanup-saved-temps } } */