https://gcc.gnu.org/g:320c2ed4d2b4b007bab5ebf0078e6c730ad25d3e
commit r15-1852-g320c2ed4d2b4b007bab5ebf0078e6c730ad25d3e
Author: YunQiang Su <s...@gcc.gnu.org>
Date:   Thu Jun 27 18:28:27 2024 +0800

    MIPS: Support more cases with alien mode of SHF.DF

    Currently, we only support the cases that strictly fit the instructions.
    For example, for V16QImode, we only support shuffles like
    (here 0 <= N0, N1, N2, N3 <= 3)
        N0,    N1,    N2,    N3
        N0+4,  N1+4,  N2+4,  N3+4
        N0+8,  N1+8,  N2+8,  N3+8
        N0+12, N1+12, N2+12, N3+12

    In fact, we can support more cases by using other SHF.DF instructions
    that do not strictly fit the mode.

    1) We can use SHF.H to support more cases for V16QImode:
    (M0/M1/M2/M3 are 0 or 2 or 4 or 6)
        M0,   M0+1, M1,   M1+1
        M2,   M2+1, M3,   M3+1
        M0+8, M0+9, M1+8, M1+9
        M2+8, M2+9, M3+8, M3+9

    2) We can use SHF.W to support some cases for V16QImode:
    (M0/M1/M2/M3 are 0 or 4 or 8 or 12)
        M0, M0+1, M0+2, M0+3
        M1, M1+1, M1+2, M1+3
        M2, M2+1, M2+2, M2+3
        M3, M3+1, M3+2, M3+3

    3) We can use SHF.W to support some cases for V8HImode:
    (M0/M1/M2/M3 are 0 or 2 or 4 or 6)
        M0, M0+1
        M1, M1+1
        M2, M2+1
        M3, M3+1

    4) We can also use SHF.W to swap the 2 parts of V2DF or V2DI.

    gcc
            * config/mips/mips-protos.h: New function mips_msa_shf_i8.
            * config/mips/mips-msa.md(MSA_WHB_W): Not used anymore;
            (msa_shf_<msafmt_f>): Use mips_msa_shf_i8.
            * config/mips/mips.cc(mips_const_vector_shuffle_set_p): Support
            more cases by trying to use alien mode instructions;
            (mips_msa_shf_i8): New function to get the correct MSA SHF
            instruction and IMM.

Diff:
---
 gcc/config/mips/mips-msa.md   |  35 +++++-----
 gcc/config/mips/mips-protos.h |   1 +
 gcc/config/mips/mips.cc       | 149 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 170 insertions(+), 15 deletions(-)

diff --git a/gcc/config/mips/mips-msa.md b/gcc/config/mips/mips-msa.md index 0081b688ce9..377c63f0d35 100644 --- a/gcc/config/mips/mips-msa.md +++ b/gcc/config/mips/mips-msa.md @@ -125,9 +125,6 @@ ;; Only floating-point modes. (define_mode_iterator FMSA [V2DF V4SF]) -;; Only used for immediate set shuffle elements instruction. 
-(define_mode_iterator MSA_WHB_W [V4SI V8HI V16QI V4SF]) - ;; The attribute gives the integer vector mode with same size. (define_mode_attr VIMODE [(V2DF "V2DI") @@ -2520,21 +2517,29 @@ (set_attr "mode" "<MODE>")]) (define_insn "msa_shf_<msafmt_f>" - [(set (match_operand:MSA_WHB_W 0 "register_operand" "=f") - (vec_select:MSA_WHB_W - (match_operand:MSA_WHB_W 1 "register_operand" "f") + [(set (match_operand:MSA 0 "register_operand" "=f") + (vec_select:MSA + (match_operand:MSA 1 "register_operand" "f") (match_operand 2 "par_const_vector_shf_set_operand" "")))] "ISA_HAS_MSA" { - HOST_WIDE_INT val = 0; - unsigned int i; - - /* We convert the selection to an immediate. */ - for (i = 0; i < 4; i++) - val |= INTVAL (XVECEXP (operands[2], 0, i)) << (2 * i); - - operands[2] = GEN_INT (val); - return "shf.<msafmt>\t%w0,%w1,%X2"; + HOST_WIDE_INT rval = mips_msa_shf_i8 (operands); + /* RVAL is IMM | (INSN << 8). IMM 0b11100100 (0xe4) leaves every element + in place, so no shf is needed; some pass should optimize this RTL out. */ + if ((rval & 0xff) == 0xe4) + gcc_unreachable (); + operands[2] = GEN_INT (rval & 0xff); + switch (rval & 0xff00) + { + default: gcc_unreachable (); + case 0x400: + return "shf.w\t%w0,%w1,%X2"; + case 0x200: + return "shf.h\t%w0,%w1,%X2"; + case 0x100: + return "shf.b\t%w0,%w1,%X2"; + } + gcc_unreachable (); } [(set_attr "type" "simd_shf") (set_attr "mode" "<MODE>")]) diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h index 75f80984c03..90b4c87fdea 100644 --- a/gcc/config/mips/mips-protos.h +++ b/gcc/config/mips/mips-protos.h @@ -387,6 +387,7 @@ extern mulsidi3_gen_fn mips_mulsidi3_gen_fn (enum rtx_code); extern void mips_register_frame_header_opt (void); extern void mips_expand_vec_cond_expr (machine_mode, machine_mode, rtx *, bool); extern void mips_expand_vec_cmp_expr (rtx *); +extern HOST_WIDE_INT mips_msa_shf_i8 (rtx *); extern void mips_emit_speculation_barrier_function (void); diff --git a/gcc/config/mips/mips.cc b/gcc/config/mips/mips.cc index 
7d4791157d1..6c797b62164 100644 --- a/gcc/config/mips/mips.cc +++ b/gcc/config/mips/mips.cc @@ -2079,6 +2079,72 @@ mips_const_vector_shuffle_set_p (rtx op, machine_mode mode) int nsets = nunits / 4; int set = 0; int i, j; + int val[4]; + bool ok; + + /* We support swapping 2 Doubleword part with shf.w. */ + if (ISA_HAS_MSA && (mode == V2DFmode || mode == V2DImode)) + { + if (!IN_RANGE (INTVAL (XVECEXP (op, 0, 0)), 0, 1) + || !IN_RANGE (INTVAL (XVECEXP (op, 0, 1)), 0, 1)) + return false; + } + + if (ISA_HAS_MSA && mode == V16QImode) + { + /* We can use shf.w if the elements are in-order inner 32bit. */ + ok = true; + for (j = 0; j < 4; j++) + { + val[0] = INTVAL (XVECEXP (op, 0, j * 4)); + val[1] = INTVAL (XVECEXP (op, 0, j * 4 + 1)); + val[2] = INTVAL (XVECEXP (op, 0, j * 4 + 2)); + val[3] = INTVAL (XVECEXP (op, 0, j * 4 + 3)); + if (val[0] != val[1] - 1 + || val[1] != val[2] - 1 + || val[2] != val[3] - 1) + ok = false; + if (val[0] != 0 && val[0] != 4 && val[0] != 8 && val[0] != 12) + ok = false; + } + if (ok) + return ok; + + /* We can use shf.h if the elements are in order inner 16bit. */ + ok = true; + for (j = 0; j < 4; j++) + { + val[0] = INTVAL (XVECEXP (op, 0, j * 2)); + val[1] = INTVAL (XVECEXP (op, 0, j * 2 + 1)); + val[2] = INTVAL (XVECEXP (op, 0, j * 2 + 8)); + val[3] = INTVAL (XVECEXP (op, 0, j * 2 + 1 + 8)); + if (val[0] != val[1] - 1 || val[2] != val[3] - 1) + ok = false; + if (val[0] != val[2] - 8 || val[1] != val[3] - 8) + ok = false; + if (val[0] != 0 && val[0] != 2 && val[0] != 4 && val[0] != 6) + ok = false; + } + if (ok) + return ok; + } + + if (ISA_HAS_MSA && mode == V8HImode) + { + /* We can use shf.w if the elements are in-order inner 32bit. 
*/ + ok = true; + for (j = 0; j < 4; j++) + { + val[0] = INTVAL (XVECEXP (op, 0, j * 2)); + val[1] = INTVAL (XVECEXP (op, 0, j * 2 + 1)); + if (val[0] != val[1] - 1) + ok = false; + if (val[0] != 0 && val[0] != 2 && val[0] != 4 && val[0] != 6) + ok = false; + } + if (ok) + return ok; + } /* Check if we have the same 4-element sets. */ for (j = 0; j < nsets; j++, set = 4 * j)
@@ -22304,6 +22370,111 @@ mips_msa_vec_parallel_const_half (machine_mode mode, bool high_p)
   return gen_rtx_PARALLEL (VOIDmode, v);
 }
 
+/* Return true if the NELTS selector indices in SEL can be performed by a
+   SHF.df instruction whose elements are WIDTH times wider than the ones
+   SEL indexes.  This mirrors the alien mode checks in
+   mips_const_vector_shuffle_set_p: every run of WIDTH indices must be
+   consecutive and start on a wide element of the first set of four wide
+   elements, and every later set must repeat the first one shifted by the
+   set size.  */
+
+static bool
+mips_msa_shf_wide_p (const HOST_WIDE_INT *sel, int nelts, int width)
+{
+  const int set_size = 4 * width;
+  int i;
+
+  for (i = 0; i < nelts; i++)
+    {
+      if (i >= set_size)
+        {
+          /* SHF.df applies one selection to every set.  */
+          if (sel[i] != sel[i - set_size] + set_size)
+            return false;
+        }
+      else if (i % width != 0)
+        {
+          /* Indices inside a run must be consecutive.  */
+          if (sel[i] != sel[i - 1] + 1)
+            return false;
+        }
+      else if (!IN_RANGE (sel[i], 0, set_size - 1) || sel[i] % width != 0)
+        {
+          /* A run must start on a wide element of the first set.  */
+          return false;
+        }
+    }
+
+  return true;
+}
+
+/* Construct and return i8 of SHF.df for the vec_select described by
+   OPERANDS: operands[0] supplies the vector mode and operands[2] is the
+   PARALLEL of constant selector indices.  The selector is expected to
+   have been accepted by mips_const_vector_shuffle_set_p.
+   Return (IMM | (INSN << 8)): The range of IMM is [0, 0xFF].
+   The INSN can be 0 (error)/1 (SHF.B)/2 (SHF.H)/4 (SHF.W).  */
+
+HOST_WIDE_INT
+mips_msa_shf_i8 (rtx *operands)
+{
+  HOST_WIDE_INT rval = 0, val[16];
+  HOST_WIDE_INT field[4] = { 0, 0, 0, 0 };
+  machine_mode mode = GET_MODE (operands[0]);
+  int native_op, nelts, width, which_op;
+  int i;
+
+  if (mode == V2DImode || mode == V2DFmode)
+    {
+      /* We use shf.w to move doublewords: doubleword N is the word pair
+         (2N, 2N+1).  */
+      for (i = 0; i < 2; i++)
+        {
+          bool low_p = INTVAL (XVECEXP (operands[2], 0, i)) == 0;
+          field[2 * i] = low_p ? 0 : 2;
+          field[2 * i + 1] = low_p ? 1 : 3;
+        }
+      which_op = 4;
+    }
+  else
+    {
+      if (mode == V16QImode)
+        native_op = 1;
+      else if (mode == V8HImode)
+        native_op = 2;
+      else if (mode == V4SImode || mode == V4SFmode)
+        native_op = 4;
+      else
+        {
+          /* Not a mode SHF.df can handle: report INSN 0 (error).  */
+          return 0;
+        }
+
+      nelts = 16 / native_op;
+      for (i = 0; i < nelts; i++)
+        val[i] = INTVAL (XVECEXP (operands[2], 0, i));
+
+      /* Try the widest alien element first and fall back to the native
+         one.  The whole selector is checked, not just its first
+         elements: a native selector such as { 1, 2, 1, 2, 5, 6, ... }
+         also starts with consecutive indices but is not a shuffle of
+         wider elements.  */
+      for (width = 4 / native_op; width > 1; width /= 2)
+        if (mips_msa_shf_wide_p (val, nelts, width))
+          break;
+
+      for (i = 0; i < 4; i++)
+        field[i] = val[i * width] / width;
+      which_op = native_op * width;
+    }
+
+  /* We convert the selection to an immediate.  */
+  for (i = 0; i < 4; i++)
+    rval |= field[i] << (2 * i);
+
+  rval |= (which_op << 8);
+  return rval;
+}
+
 /* A subroutine of mips_expand_vec_init, match constant vector elements.  */
 
 static inline bool