Hi, the attached patch simplifies vector conditional expressions like v < 0 ? -1 : 0 into v >> 31. The code is largely based on the x86 implementation of this feature by Jakub Jelinek. In the future (and if it proves useful for more backends), it could make sense to implement this directly at the tree level.
Bootstrapped and regression-tested on s390.

Regards
 Robin

gcc/ChangeLog:

2015-12-15  Robin Dapp  <rd...@linux.vnet.ibm.com>

	* config/s390/s390.c (s390_expand_vcond): Convert vector
	conditional into shift.
	* config/s390/vector.md: Change operand predicate.

gcc/testsuite/ChangeLog:

2015-12-15  Robin Dapp  <rd...@linux.vnet.ibm.com>

	* gcc.target/s390/vcond-shift.c: New test to check vcond
	simplification.
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 67639bc..a72c9e1 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -6108,19 +6108,60 @@ s390_expand_vcond (rtx target, rtx then, rtx els, machine_mode result_mode; rtx result_target; + machine_mode target_mode = GET_MODE (target); + machine_mode cmp_mode = GET_MODE (cmp_op1); + rtx op = (cond == LT) ? els : then; + + /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31 + and x < 0 ? 1 : 0 into (unsigned) x >> 31. Likewise + for short and byte (x >> 15 and x >> 7 respectively). */ + if ((cond == LT || cond == GE) + && target_mode == cmp_mode + && cmp_op2 == CONST0_RTX (cmp_mode) + && op == CONST0_RTX (target_mode) + && s390_vector_mode_supported_p (target_mode) + && GET_MODE_CLASS (target_mode) == MODE_VECTOR_INT) + { + rtx negop = (cond == LT) ? then : els; + + int shift = GET_MODE_BITSIZE (GET_MODE_INNER (target_mode)) - 1; + + /* if x < 0 ? 1 : 0 or if x >= 0 ? 0 : 1 */ + if (negop == CONST1_RTX (target_mode)) + { + rtx res = expand_simple_binop (cmp_mode, LSHIFTRT, cmp_op1, + GEN_INT (shift), target, + 1, OPTAB_DIRECT); + if (res != target) + emit_move_insn (target, res); + return; + } + + /* if x < 0 ? -1 : 0 or if x >= 0 ? 0 : -1 */ + else if (constm1_operand (negop, target_mode)) + { + rtx res = expand_simple_binop (cmp_mode, ASHIFTRT, cmp_op1, + GEN_INT (shift), target, + 0, OPTAB_DIRECT); + if (res != target) + emit_move_insn (target, res); + return; + } + } + /* We always use an integral type vector to hold the comparison result. */ - result_mode = GET_MODE (cmp_op1) == V2DFmode ? V2DImode : GET_MODE (cmp_op1); + result_mode = cmp_mode == V2DFmode ? V2DImode : cmp_mode; result_target = gen_reg_rtx (result_mode); - /* Alternatively this could be done by reload by lowering the cmp* - predicates. But it appears to be better for scheduling etc. to - have that in early. 
*/ + /* We allow vector immediates as comparison operands that + can be handled by the optimization above but not by the + following code. Hence, force them into registers here. */ if (!REG_P (cmp_op1)) - cmp_op1 = force_reg (GET_MODE (target), cmp_op1); + cmp_op1 = force_reg (target_mode, cmp_op1); if (!REG_P (cmp_op2)) - cmp_op2 = force_reg (GET_MODE (target), cmp_op2); + cmp_op2 = force_reg (target_mode, cmp_op2); s390_expand_vec_compare (result_target, cond, cmp_op1, cmp_op2); @@ -6130,7 +6171,7 @@ s390_expand_vcond (rtx target, rtx then, rtx els, if (constm1_operand (then, GET_MODE (then)) && const0_operand (els, GET_MODE (els))) { - emit_move_insn (target, gen_rtx_SUBREG (GET_MODE (target), + emit_move_insn (target, gen_rtx_SUBREG (target_mode, result_target, 0)); return; } @@ -6139,10 +6180,10 @@ s390_expand_vcond (rtx target, rtx then, rtx els, /* This gets triggered e.g. with gcc.c-torture/compile/pr53410-1.c */ if (!REG_P (then)) - then = force_reg (GET_MODE (target), then); + then = force_reg (target_mode, then); if (!REG_P (els)) - els = force_reg (GET_MODE (target), els); + els = force_reg (target_mode, els); tmp = gen_rtx_fmt_ee (EQ, VOIDmode, result_target, @@ -6150,9 +6191,9 @@ s390_expand_vcond (rtx target, rtx then, rtx els, /* We compared the result against zero above so we have to swap then and els here. 
*/ - tmp = gen_rtx_IF_THEN_ELSE (GET_MODE (target), tmp, els, then); + tmp = gen_rtx_IF_THEN_ELSE (target_mode, tmp, els, then); - gcc_assert (GET_MODE (target) == GET_MODE (then)); + gcc_assert (target_mode == GET_MODE (then)); emit_insn (gen_rtx_SET (target, tmp)); } diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index c9f5890..f6a85c8 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -403,7 +403,7 @@ (if_then_else:V_HW (match_operator 3 "comparison_operator" [(match_operand:V_HW2 4 "register_operand" "") - (match_operand:V_HW2 5 "register_operand" "")]) + (match_operand:V_HW2 5 "nonmemory_operand" "")]) (match_operand:V_HW 1 "nonmemory_operand" "") (match_operand:V_HW 2 "nonmemory_operand" "")))] "TARGET_VX && GET_MODE_NUNITS (<V_HW:MODE>mode) == GET_MODE_NUNITS (<V_HW2:MODE>mode)" @@ -418,7 +418,7 @@ (if_then_else:V_HW (match_operator 3 "comparison_operator" [(match_operand:V_HW2 4 "register_operand" "") - (match_operand:V_HW2 5 "register_operand" "")]) + (match_operand:V_HW2 5 "nonmemory_operand" "")]) (match_operand:V_HW 1 "nonmemory_operand" "") (match_operand:V_HW 2 "nonmemory_operand" "")))] "TARGET_VX && GET_MODE_NUNITS (<V_HW:MODE>mode) == GET_MODE_NUNITS (<V_HW2:MODE>mode)" diff --git a/gcc/testsuite/gcc.target/s390/vcond-shift.c b/gcc/testsuite/gcc.target/s390/vcond-shift.c new file mode 100644 index 0000000..f58bd1f --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vcond-shift.c @@ -0,0 +1,61 @@ +/* Check if conditional vector instructions are simplified + into shift operations. 
*/ +/* { dg-do compile { target { s390*-*-* } } } */ +/* { dg-options "-O3 -march=z13 -mzarch" } */ + +/* { dg-final { scan-assembler "vesraf\t%v.?,%v.?,31" } } */ +/* { dg-final { scan-assembler "vesrah\t%v.?,%v.?,15" } } */ +/* { dg-final { scan-assembler "vesrab\t%v.?,%v.?,7" } } */ +/* { dg-final { scan-assembler-not "vzero\t*" } } */ +/* { dg-final { scan-assembler "vesrlf\t%v.?,%v.?,31" } } */ +/* { dg-final { scan-assembler "vesrlh\t%v.?,%v.?,15" } } */ +/* { dg-final { scan-assembler "vesrlb\t%v.?,%v.?,7" } } */ + +#define SZ 4 +#define SZ2 8 +#define SZ3 16 + +void foo(int *w) +{ + int i; + /* Should expand to (w + (w < 0 ? 1 : 0)) >> 1 + which in turn should get simplified to (w + (w >> 31)) >> 1. */ + for (i = 0; i < SZ; i++) + w[i] = w[i] / 2; +} + +void foo2(short *w) +{ + int i; + for (i = 0; i < SZ2; i++) + w[i] = w[i] / 2; +} + + +void foo3(signed char *w) +{ + int i; + for (i = 0; i < SZ3; i++) + w[i] = w[i] / 2; +} + +int baz(int *x) +{ + int i; + for (i = 0; i < SZ; i++) + x[i] = x[i] < 0 ? -1 : 0; +} + +int baf(short *x) +{ + int i; + for (i = 0; i < SZ2; i++) + x[i] = x[i] >= 0 ? 0 : 1; +} + +int bal(signed char *x) +{ + int i; + for (i = 0; i < SZ3; i++) + x[i] = x[i] >= 0 ? 0 : -1; +}