From: Andi Kleen <a...@gcc.gnu.org> The GFNI AVX gf2p8affineqb instruction can be used to implement vectorized byte shifts or rotates. This patch uses them to implement shift and rotate patterns to allow the vectorizer to use them. Previously AVX couldn't do rotates (except with XOP) and had to handle 8 bit shifts with a half throughput 16 bit shift.
This is only implemented for constant shifts. In theory it could be used with a lookup table for variable shifts, but it's unclear if it's worth it. The vectorizer cost model could be improved, but seems to work for now. It doesn't model the true latencies of the instructions. Also it doesn't accout for the memory loading of the mask, assuming that for a loop it will be loaded outside the loop. The instructions would also support more complex patterns (e.g. arbitary bit movement or inversions), so some of the tricks applied to ternlog could be applied here too to collapse more code. It's trickier because the input patterns can be much longer since they can apply to every bit individually. I didn't attempt any of this. There's currently no test case for the masked variants, they seem to be difficult to trigger with the vectorizer. gcc/ChangeLog: * config/i386/i386-expand.cc (ix86_vgf2p8affine_shift_matrix): New function to compute shift/rotate matrixes for gf2p8affine. (ix86_expand_multi_arg_builtin): Weaken predicate assert check to handle new more general pattern. * config/i386/i386-protos.h (ix86_vgf2p8affine_shift_matrix): Declare new function. * config/i386/i386.cc (ix86_shift_rotate_cost): Add cost model for shift/rotate implemented using gf2p8affine. * config/i386/sse.md (<insn><mode>3_mask): Add GFNI case. (<insn><mode>3<mask_name>): New pattern for GFNI. Handle V16QI case for XOP rotate. (rotl<mode>3, rotr<mode>3): Exclude V16QI case. gcc/testsuite/ChangeLog: * gcc.target/i386/shift-gf2p8affine-1.c: New test * gcc.target/i386/shift-gf2p8affine-2.c: New test * gcc.target/i386/shift-gf2p8affine-3.c: New test --- gcc/config/i386/i386-expand.cc | 113 +++++++++- gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.cc | 14 ++ gcc/config/i386/sse.md | 79 ++++++- .../gcc.target/i386/shift-gf2p8affine-1.c | 64 ++++++ .../gcc.target/i386/shift-gf2p8affine-2.c | 196 ++++++++++++++++++ .../gcc.target/i386/shift-gf2p8affine-3.c | 87 ++++++++ 7 files changed, 543 insertions(+), 11 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c create mode 100644 gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c create mode 100644 gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 09aa9b1461cc..ccc058228c74 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -10784,15 +10784,15 @@ ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, } else { + /* The predicates may not match for the xop_rotlv16qi3 + case because the new shared pattern uses a + nonimmediate vs a register operand. Since it's a super + set ignore the mismatch. */ gcc_checking_assert (nargs == 2 && insn_data[new_icode].operand[0].mode == tmode && insn_data[new_icode].operand[1].mode == tmode - && insn_data[new_icode].operand[2].mode == mode - && insn_data[new_icode].operand[0].predicate - == insn_data[icode].operand[0].predicate - && insn_data[new_icode].operand[1].predicate - == insn_data[icode].operand[1].predicate); + && insn_data[new_icode].operand[2].mode == mode); icode = new_icode; goto non_constant; } @@ -27033,6 +27033,109 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, return target; } +/* GF2P8AFFINEQB matrixes to implement shift and rotate. */ + +static const uint64_t matrix_ashift[8] = +{ + 0, + 0x0001020408102040, /* 1 l */ + 0x0000010204081020, /* 2 l */ + 0x0000000102040810, /* 3 l */ + 0x0000000001020408, /* 4 l */ + 0x0000000000010204, /* 5 l */ + 0x0000000000000102, /* 6 l */ + 0x0000000000000001 /* 7 l */ +}; + +static const uint64_t matrix_lshiftrt[8] = +{ + 0, + 0x0204081020408000, /* 1 r */ + 0x0408102040800000, /* 2 r */ + 0x0810204080000000, /* 3 r */ + 0x1020408000000000, /* 4 r */ + 0x2040800000000000, /* 5 r */ + 0x4080000000000000, /* 6 r */ + 0x8000000000000000 /* 7 r */ +}; + +static const uint64_t matrix_ashiftrt[8] = +{ + 0, + 0x0204081020408080, /* 1 r */ + 0x0408102040808080, /* 2 r */ + 0x0810204080808080, /* 3 r */ + 0x1020408080808080, /* 4 r */ + 0x2040808080808080, /* 5 r */ + 0x4080808080808080, /* 6 r */ + 0x8080808080808080 /* 7 r */ +}; + +static const uint64_t matrix_rotate[8] = +{ + 0, + 0x8001020408102040, /* 1 rol8 */ + 0x4080010204081020, /* 2 rol8 */ + 0x2040800102040810, /* 3 rol8 */ + 0x1020408001020408, /* 4 rol8 */ + 0x0810204080010204, /* 5 rol8 */ + 0x0408102040800102, /* 6 rol8 */ + 0x0204081020408001 /* 7 rol8 */ +}; + +static const uint64_t matrix_rotatert[8] = +{ + 0, + 0x0204081020408001, /* 1 ror8 */ + 0x0408102040800102, /* 2 ror8 */ + 0x0810204080010204, /* 3 ror8 */ + 0x1020408001020408, /* 4 ror8 */ + 0x2040800102040810, /* 5 ror8 */ + 0x4080010204081020, /* 6 ror8 */ + 0x8001020408102040 /* 7 ror8 */ +}; + +/* Return rtx to load a 64bit GF2P8AFFINE GP(2) matrix implementing a shift + for CODE and shift count COUNT into register with vector of size of SRC. */ + +rtx +ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code) +{ + machine_mode mode = GET_MODE (src); + const uint64_t *matrix; + unsigned shift = INTVAL (count) & 7; + gcc_assert (shift > 0 && shift < 8); + + switch (code) + { + case ASHIFT: + matrix = matrix_ashift; + break; + case ASHIFTRT: + matrix = matrix_ashiftrt; + break; + case LSHIFTRT: + matrix = matrix_lshiftrt; + break; + case ROTATE: + matrix = matrix_rotate; + break; + case ROTATERT: + matrix = matrix_rotatert; + break; + default: + gcc_unreachable (); + } + + int nelts = GET_MODE_NUNITS (mode); + rtvec vec = rtvec_alloc (nelts); + uint64_t ma = matrix[shift]; + for (int i = 0; i < nelts; i++) + RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode); + + return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec)); +} + /* Trunc a vector to a narrow vector, like v4di -> v4si. */ void diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 69bc0ee570dd..279f6ffdf26a 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -448,3 +448,4 @@ extern void ix86_set_handled_components (sbitmap); /* In i386-expand.cc. */ bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*, HOST_WIDE_INT*); +rtx ix86_vgf2p8affine_shift_matrix (rtx, rtx, enum rtx_code); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 65e04d3760d5..38a160709bd4 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -22102,6 +22102,15 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, } /* FALLTHRU */ case V32QImode: + if (TARGET_GFNI && constant_op1) + { + /* Use vgf2p8affine. One extra load for the mask, but in a loop + with enough registers it will be moved out. So for now don't + account the constant mask load. This is not quite right + for non loop vectorization. */ + extra = 0; + return ix86_vec_cost (mode, cost->sse_op) + extra; + } if (TARGET_AVX2) /* Use vpbroadcast. */ extra = cost->sse_op; @@ -22136,6 +22145,11 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, count = 9; return ix86_vec_cost (mode, cost->sse_op * count) + extra; + case V64QImode: + /* Ignore the mask load for GF2P8AFFINEQB. */ + extra = 0; + return ix86_vec_cost (mode, cost->sse_op) + extra; + case V2DImode: case V4DImode: /* V*DImode arithmetic right shift is emulated. */ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ec74f93731d8..01b8d91c5d55 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -26559,9 +26559,9 @@ (define_insn "xop_pperm_pack_v8hi_v16qi" ;; XOP packed rotate instructions (define_expand "rotl<mode>3" - [(set (match_operand:VI_128 0 "register_operand") - (rotate:VI_128 - (match_operand:VI_128 1 "nonimmediate_operand") + [(set (match_operand:VI248_128 0 "register_operand") + (rotate:VI248_128 + (match_operand:VI248_128 1 "nonimmediate_operand") (match_operand:SI 2 "general_operand")))] "TARGET_XOP" { @@ -26590,9 +26590,9 @@ (define_expand "rotl<mode>3" }) (define_expand "rotr<mode>3" - [(set (match_operand:VI_128 0 "register_operand") - (rotatert:VI_128 - (match_operand:VI_128 1 "nonimmediate_operand") + [(set (match_operand:VI248_128 0 "register_operand") + (rotatert:VI248_128 + (match_operand:VI248_128 1 "nonimmediate_operand") (match_operand:SI 2 "general_operand")))] "TARGET_XOP" { @@ -26984,11 +26984,78 @@ (define_expand "<insn><mode>3" gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3); emit_insn (gen (operands[0], operands[1], tmp)); } + else if (TARGET_GFNI && CONST_INT_P (operands[2])) + { + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], + <CODE>); + emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix, + GEN_INT (0))); + } else ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]); DONE; }) +(define_expand "<insn><mode>3_mask" + [(set (match_operand:VI1_AVX512 0 "register_operand") + (any_shift:VI1_AVX512 + (match_operand:VI1_AVX512 1 "register_operand") + (match_operand:SI 2 "const_int_operand"))) + (match_operand:VI1_AVX512 3 "nonimm_or_0_operand") + (match_operand:<avx512fmaskmode> 4 "register_operand")] + "TARGET_GFNI" +{ + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>); + emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[1], matrix, + GEN_INT (0), operands[3], operands[4])); + DONE; +}) + +(define_expand "<insn><mode>3<mask_name>" + [(set (match_operand:VI1_AVX512 0 "register_operand") + (any_rotate:VI1_AVX512 + (match_operand:VI1_AVX512 1 "register_operand") + (match_operand:SI 2 "const_int_operand")))] + "" +{ + /* Handle the V16QI XOP case to avoid a conflict with the other expand. */ + if (TARGET_XOP && ARRAY_SIZE (operands) <= 3 && <MODE>mode == V16QImode + && ! const_0_to_7_operand (operands[2], SImode)) + { + rtvec vs = rtvec_alloc (16); + rtx par = gen_rtx_PARALLEL (V16QImode, vs); + rtx reg = gen_reg_rtx (V16QImode); + rtx op2 = operands[2]; + int i; + + if (GET_MODE (op2) != QImode) + { + op2 = gen_reg_rtx (QImode); + convert_move (op2, operands[2], false); + } + + for (i = 0; i < 16; i++) + RTVEC_ELT (vs, i) = op2; + + emit_insn (gen_vec_initv16qiqi (reg, par)); + if (<CODE> == ROTATERT) + { + rtx neg = gen_reg_rtx (<MODE>mode); + emit_insn (gen_neg<mode>2 (neg, reg)); + emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], neg)); + reg = neg; + } + emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], reg)); + DONE; + } + else if (!TARGET_GFNI) + FAIL; + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>); + emit_insn (gen_vgf2p8affineqb_<mode><mask_name> (operands[0], operands[1], matrix, + GEN_INT (0) <mask_operand_arg34>)); + DONE; +}) + (define_expand "ashrv2di3" [(set (match_operand:V2DI 0 "register_operand") (ashiftrt:V2DI diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c new file mode 100644 index 000000000000..39a94eb70b22 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c @@ -0,0 +1,64 @@ +/* { dg-do compile } */ +/* { dg-options "-mgfni -mavx512bw -mavx512f -O3" } */ +/* { dg-final { scan-assembler-times "vgf2p8affineqb" 13 } } */ + +#ifndef N +#define N 5 +#endif + +void +ubyteshiftl (unsigned char *a, int len) +{ + int i; + for (i = 0; i < len; i++) + a[i] <<= N; +} + +void +ubyteshiftr (unsigned char *a, int len) +{ + int i; + for (i = 0; i < len; i++) + a[i] >>= N; +} + +void +ubyteshiftl_mask (unsigned char *a, int len) +{ + int i; + for (i = 0; i < len; i++) + if (a[i] & 1) + a[i] <<= N; +} + +void +sbyteshiftl (signed char *a, int len) +{ + int i; + for (i = 0; i < len; i++) + a[i] <<= N; +} + +void +sbyteshiftr (signed char *a, int len) +{ + int i; + for (i = 0; i < len; i++) + a[i] >>= N; +} + +void +ubyteror (unsigned char *a, int len) +{ + int i; + for (i = 0; i < len; i++) + a[i] = a[i] << N | a[i] >> (8 - N); +} + +void +ubyterol (unsigned char *a, int len) +{ + int i; + for (i = 0; i < len; i++) + a[i] = a[i] >> N | a[i] << (8 - N); +} diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c new file mode 100644 index 000000000000..1027d69e2f22 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c @@ -0,0 +1,196 @@ +/* { dg-do run } */ +/* { dg-options "-mgfni -mavx512bw -mavx512f -O3 -Wno-shift-count-negative" } */ + +#include <string.h> + +#ifndef N1 +#define N1 5 +#endif + +#ifndef N2 +#define N2 3 +#endif + +#ifndef N3 +#define N3 1 +#endif + +#ifndef N4 +#define N4 7 +#endif + +#ifndef N5 +#define N5 -3 +#endif + +#ifndef FILLER +#define FILLER 0xab +#endif + +#define FUNC(N) \ + void ubyteshiftl##N(unsigned char *a, int len) \ + { \ + int i; \ + for (i = 0; i < len; i++) \ + a[i] <<= N; \ + } \ + \ + void ubyteshiftr##N(unsigned char *a, int len) \ + { \ + int i; \ + for (i = 0; i < len; i++) \ + a[i] >>= N; \ + } \ + \ + void ubyteshiftl_mask##N(unsigned char *a, int len) \ + { \ + int i; \ + for (i = 0; i < len; i++) \ + if (a[i] & 1) \ + a[i] <<= N; \ + } \ + \ + void sbyteshiftl##N(signed char *a, int len) \ + { \ + int i; \ + for (i = 0; i < len; i++) \ + a[i] <<= N; \ + } \ + \ + void sbyteshiftr##N(signed char *a, int len) \ + { \ + int i; \ + for (i = 0; i < len; i++) \ + a[i] >>= N; \ + } \ + \ + void ubyteror##N(unsigned char *a, int len) \ + { \ + int i; \ + for (i = 0; i < len; i++) \ + a[i] = a[i] << N | a[i] >> (8-N); \ + } \ + \ + void ubyterol##N(unsigned char *a, int len) \ + { \ + int i; \ + for (i = 0; i < len; i++) \ + a[i] = a[i] >> N | a[i] << (8-N); \ + } \ + void ubyteshiftl##N##ref(unsigned char *a, int len) \ + { \ + int i; \ + _Pragma("GCC novector") \ + for (i = 0; i < len; i++) \ + a[i] <<= N; \ + } \ + \ + void ubyteshiftr##N##ref(unsigned char *a, int len) \ + { \ + int i; \ + _Pragma("GCC novector") \ + for (i = 0; i < len; i++) \ + a[i] >>= N; \ + } \ + \ + void ubyteshiftl_mask##N##ref(unsigned char *a, int len) \ + { \ + int i; \ + _Pragma("GCC novector") \ + for (i = 0; i < len; i++) \ + if (a[i] & 1) \ + a[i] <<= N; \ + } \ + \ + void sbyteshiftl##N##ref(signed char *a, int len) \ + { \ + int i; \ + _Pragma("GCC novector") \ + for (i = 0; i < len; i++) \ + a[i] <<= N; \ + } \ + \ + void sbyteshiftr##N##ref(signed char *a, int len) \ + { \ + int i; \ + _Pragma("GCC novector") \ + for (i = 0; i < len; i++) \ + a[i] >>= N; \ + } \ + \ + void ubyteror##N##ref(unsigned char *a, int len) \ + { \ + int i; \ + _Pragma("GCC novector") \ + for (i = 0; i < len; i++) \ + a[i] = a[i] << N | a[i] >> (8-N); \ + } \ + \ + void ubyterol##N##ref(unsigned char *a, int len) \ + { \ + int i; \ + _Pragma("GCC novector") \ + for (i = 0; i < len; i++) \ + a[i] = a[i] >> N | a[i] << (8-N); \ + } + +FUNC (N1) +FUNC (N2) +FUNC (N3) +FUNC (N4) +FUNC (N5) + +#define TEST(N, func) \ + memset (array, filler, len); \ + func##N (array, len); \ + memset (array2, filler, len); \ + func##N##ref (array2, len); \ + if (memcmp (array, array2, len)) __builtin_abort () + +int main () +{ + __builtin_cpu_init (); + if (!__builtin_cpu_supports ("gfni")) + return 0; + + const unsigned long len = 256; + char array[len], array2[len]; + unsigned char filler = FILLER; + + TEST (N1, ubyteshiftl); + TEST (N1, ubyteshiftl_mask); + TEST (N1, sbyteshiftl); + TEST (N1, sbyteshiftr); + TEST (N1, ubyteror); + TEST (N1, ubyterol); + + TEST (N2, ubyteshiftl); + TEST (N2, ubyteshiftl_mask); + TEST (N2, sbyteshiftl); + TEST (N2, sbyteshiftr); + TEST (N2, ubyteror); + TEST (N2, ubyterol); + + TEST (N3, ubyteshiftl); + TEST (N3, ubyteshiftl_mask); + TEST (N3, sbyteshiftl); + TEST (N3, sbyteshiftr); + TEST (N3, ubyteror); + TEST (N3, ubyterol); + + TEST (N4, ubyteshiftl); + TEST (N4, ubyteshiftl_mask); + TEST (N4, sbyteshiftl); + TEST (N4, sbyteshiftr); + TEST (N4, ubyteror); + TEST (N4, ubyterol); + + TEST (N5, ubyteshiftl); + TEST (N5, ubyteshiftl_mask); + TEST (N5, sbyteshiftl); + TEST (N5, sbyteshiftr); + TEST (N5, ubyteror); + TEST (N5, ubyterol); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c new file mode 100644 index 000000000000..856893d9a5fe --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c @@ -0,0 +1,87 @@ +/* { dg-do compile } */ +/* { dg-options "-mgfni -mavx512bw -mavx512f -O3" } */ +/* { dg-final { scan-assembler-times "vgf2p8affineqb" 12 } } */ + +/* Based on a test case from Andrew Pinski */ + +#ifndef N +#define N 5 +#endif + +void +ubyteshiftl (unsigned char *restrict a, unsigned char *restrict b, unsigned char *restrict c, int len) +{ + int i; + for (i = 0; i < len; i++) + { + a[i] = c[i] ? (a[i] | b[i]) << N : a[i]; + a[i] = (!c[i]) ? (a[i] ^ b[i]) << N : a[i]; + } +} + +void +ubyteshiftr (unsigned char *restrict a, unsigned char *restrict b, unsigned char *restrict c, int len) +{ + int i; + for (i = 0; i < len; i++) + { + a[i] = c[i] ? (a[i] | b[i]) >> N : a[i]; + a[i] = (!c[i]) ? (a[i] ^ b[i]) >> N : a[i]; + } +} + +void +sbyteshiftl (signed char *restrict a, signed char *restrict b, signed char *restrict c, int len) +{ + int i; + for (i = 0; i < len; i++) + { + a[i] = c[i] ? (a[i] | b[i]) << N : a[i]; + a[i] = (!c[i]) ? (a[i] ^ b[i]) << N : a[i]; + } +} + +void +sbyteshiftr (signed char *restrict a, signed char *restrict b, signed char *restrict c, int len) +{ + int i; + for (i = 0; i < len; i++) + { + a[i] = c[i] ? (a[i] | b[i]) >> N : a[i]; + a[i] = (!c[i]) ? (a[i] ^ b[i]) >> N : a[i]; + } +} + +static inline unsigned char rol8(unsigned char v, int c) +{ + return (v >> c) | (v << (8-c)); +} + +static inline unsigned char ror8(unsigned char v, int c) +{ + return (v << c) | (v >> (8-c)); +} + +void +ubyterol (unsigned char *restrict a, unsigned char *restrict b, unsigned char *restrict c, int len) +{ + int i; + for (i = 0; i < len; i++) + { + a[i] = c[i] ? rol8(a[i] | b[i], N) : a[i]; + a[i] = (!c[i]) ? rol8(a[i] ^ b[i], N) : a[i]; + } +} + +void +ubyteror (unsigned char *restrict a, unsigned char *restrict b, unsigned char *restrict c, int len) +{ + int i; + for (i = 0; i < len; i++) + { + a[i] = c[i] ? ror8(a[i] | b[i], N) : a[i]; + a[i] = (!c[i]) ? ror8(a[i] ^ b[i], N) : a[i]; + } +} + + -- 2.49.0