On Mon, Jun 15, 2020 at 09:29:29PM +0800, Hongtao Liu via Gcc-patches wrote:
> Basically i "copy" this optimization from clang i386 backend, Refer
> to pr95524 for details.
> Bootstrap is ok, regression test on i386/x86-64 backend is ok.
>
> gcc/ChangeLog:
> PR target/95524
> * gcc/config/i386/i386-expand.c
> (ix86_expand_vec_shift_qihi_constant): New function.
No gcc/ prefix in gcc/ChangeLog (git push would be refused).
And with or without the prefix it fits, so:
* config/i386/i386-expand.c (ix86_expand_vec_shift_qihi_constant): New
function.
> * gcc/config/i386/i386-protos.h: Declare.
This needs to mention the function name again.
> * gcc/config/i386/sse.md: Optimize shift V*QImode by constant.
And thus needs to mention the define_expand, (<shift_insn><mode>3)
in this case.
+ machine_mode qimode, himode;
+ unsigned int shift_constant, and_constant, xor_constant;
+ rtx vec_const_and, vec_const_xor;
+ rtx tmp, op1_subreg;
+ rtx (*gen_shift) (rtx, rtx, rtx);
+ rtx (*gen_and) (rtx, rtx, rtx);
+ rtx (*gen_xor) (rtx, rtx, rtx);
+ rtx (*gen_sub) (rtx, rtx, rtx);
+
+ /* Only optimize shift by constant. */
+ if (!CONST_INT_P (op2))
+ return false;
+
+ qimode = GET_MODE (dest);
+ shift_constant = INTVAL (op2);
I wonder if shift_constant shouldn't be unsigned HOST_WIDE_INT
instead, and which >= 8 values you should try to do something about rather
than punt on optimizing. If this is just from normal C shifts, then
perhaps anything >= 32U would be invalid, if it is from intrinsics,
perhaps there is a different behavior (masked with something?).
+ /* Shift constant greater equal 8 result into 0. */
+ if (shift_constant > 7)
+ {
+ if (code == ASHIFT || code == LSHIFTRT)
+ {
+ emit_move_insn (dest, CONST0_RTX (qimode));
+ return true;
+ }
+ /* Sign bit not known. */
+ else if (code == ASHIFTRT)
+ return false;
+ else
+ gcc_unreachable ();
+ }
+
+ gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+ /* Record sign bit. */
+ xor_constant = 1 << (8 - shift_constant - 1);
+
+ /* Zero upper/lower bits shift from left/right element. */
+ and_constant = code == ASHIFT ? 256 - (1 << shift_constant) :
+ (1 << (8 - shift_constant)) - 1;
Formatting. Should be:
and_constant
= (code == ASHIFT ? 256 - (1 << shift_constant)
: (1 << (8 - shift_constant)) - 1);
or so.
+
+ switch (qimode)
+ {
+ case V16QImode:
+ himode = V8HImode;
+ gen_shift = (code == ASHIFT) ? gen_ashlv8hi3 :
+ (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3;
Similarly. : or ? should just never appear at the end of line.
And probably no reason to wrap the comparisons in parens,
instead wrap the whole expression. So:
gen_shift
= (code == ASHIFT
? gen_ashlv8hi3
: code == ASHIFTRT ? gen_ashrv8hi3 : gen_lshrv8hi3);
or so?
+ gen_and = gen_andv16qi3;
+ gen_xor = gen_xorv16qi3;
+ gen_sub = gen_subv16qi3;
+ break;
+ case V32QImode:
+ himode = V16HImode;
+ gen_shift = (code == ASHIFT) ? gen_ashlv16hi3 :
+ (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3;
Ditto.
+ gen_and = gen_andv32qi3;
+ gen_xor = gen_xorv32qi3;
+ gen_sub = gen_subv32qi3;
+ break;
+ case V64QImode:
+ himode = V32HImode;
+ gen_shift = (code == ASHIFT) ? gen_ashlv32hi3 :
+ (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3;
Ditto.
+ tmp = gen_reg_rtx (himode);
+ vec_const_and = gen_reg_rtx (qimode);
+ op1_subreg = simplify_gen_subreg (himode, op1, qimode, 0);
Just use lowpart_subreg?
+
+ /* For ASHIFT and LSHIFTRT, perform operation like
+ vpsllw/vpsrlw $shift_constant, %op1, %dest.
+ vpand %vec_const_and, %dest. */
+ emit_insn (gen_shift (tmp, op1_subreg, op2));
+ emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
+ emit_move_insn (vec_const_and,
+ ix86_build_const_vector (qimode, true,
+ GEN_INT (and_constant)));
+ emit_insn (gen_and (dest, dest, vec_const_and));
+
+ /* For ASHIFTRT, perform extra operation like
+ vpxor %vec_const_xor, %dest, %dest
+ vpsubb %vec_const_xor, %dest, %dest */
+ if (code == ASHIFTRT)
+ {
+ vec_const_xor = gen_reg_rtx (qimode);
+ emit_move_insn (vec_const_xor,
+ ix86_build_const_vector (qimode, true,
+ GEN_INT (xor_constant)));
+ emit_insn (gen_xor (dest, dest, vec_const_xor));
+ emit_insn (gen_sub (dest, dest, vec_const_xor));
+ }
+ return true;
+}
+
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index f5320494fa1..7c2ce618f3f 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -206,6 +206,7 @@ extern void ix86_expand_round_sse4 (rtx, rtx);
extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx);
extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
+extern bool ix86_expand_vec_shift_qihi_constant (enum rtx_code, rtx, rtx, rtx);
extern rtx ix86_split_stack_guard (void);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index aa9fdc87c68..b466950af40 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -19863,6 +19863,9 @@
gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
emit_insn (gen (operands[0], operands[1], tmp));
}
+ else if (ix86_expand_vec_shift_qihi_constant (<CODE>, operands[0],
+ operands[1], operands[2]))
+ DONE;
Perhaps do instead:
else if (!ix86_expand_vec_shift_qihi_constant (<CODE>, operands[0],
operands[1], operands[2]))
ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
DONE;
?
Jakub