In `store_bit_field_1`, when the value to be written to the bitfield and/or the bitfield itself have vector modes, non-canonical subregs are generated, like `(subreg:V4SI (reg:V8SI x) 0)`. If one of them is a scalar, this happens only when the scalar mode is different from the vector's inner mode.
This patch tries to prevent this, using vec_set patterns when possible. Bootstrapped/regtested on AArch64 and x86_64. PR rtl-optimization/118873 gcc/ChangeLog: * expmed.cc (generate_vec_concat): New function. (store_bit_field_1): Check for cases where the value to be written and/or the bitfield have vector modes and try to generate the corresponding vec_set patterns instead of subregs. gcc/testsuite/ChangeLog: * gcc.target/i386/pr118873.c: New test. --- gcc/expmed.cc | 174 ++++++++++++++++++++++- gcc/testsuite/gcc.target/i386/pr118873.c | 33 +++++ 2 files changed, 200 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr118873.c diff --git a/gcc/expmed.cc b/gcc/expmed.cc index 8cf10d9c73bf..8c641f55b9c6 100644 --- a/gcc/expmed.cc +++ b/gcc/expmed.cc @@ -740,6 +740,42 @@ store_bit_field_using_insv (const extraction_insn *insv, rtx op0, return false; } +/* Helper function for store_bit_field_1, used in the case that the bitfield + and the destination are both vectors. It extracts the elements of OP from + LOWER_BOUND to UPPER_BOUND using a vec_select and uses a vec_concat to + concatenate the extracted elements with the VALUE. 
*/ + +rtx +generate_vec_concat (machine_mode fieldmode, rtx op, rtx value, + HOST_WIDE_INT lower_bound, + HOST_WIDE_INT upper_bound) +{ + if (!VECTOR_MODE_P (fieldmode)) + return NULL_RTX; + + rtvec vec = rtvec_alloc (GET_MODE_NUNITS (fieldmode).to_constant ()); + machine_mode outermode = GET_MODE (op); + + for (HOST_WIDE_INT i = lower_bound; i < upper_bound; ++i) + RTVEC_ELT (vec, i) = GEN_INT (i); + rtx par = gen_rtx_PARALLEL (VOIDmode, vec); + rtx select = gen_rtx_VEC_SELECT (fieldmode, op, par); + if (BYTES_BIG_ENDIAN) + { + if (lower_bound > 0) + return gen_rtx_VEC_CONCAT (outermode, select, value); + else + return gen_rtx_VEC_CONCAT (outermode, value, select); + } + else + { + if (lower_bound > 0) + return gen_rtx_VEC_CONCAT (outermode, value, select); + else + return gen_rtx_VEC_CONCAT (outermode, select, value); + } +} + /* A subroutine of store_bit_field, with the same arguments. Return true if the operation could be implemented. @@ -778,18 +814,142 @@ store_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, poly_uint64 bitnum, if (VECTOR_MODE_P (outermode) && !MEM_P (op0) && optab_handler (vec_set_optab, outermode) != CODE_FOR_nothing - && fieldmode == innermode - && known_eq (bitsize, GET_MODE_PRECISION (innermode)) && multiple_p (bitnum, GET_MODE_PRECISION (innermode), &pos)) { + /* Cases where the destination's inner mode is not equal to the + value's mode need special treatment. */ + class expand_operand ops[3]; enum insn_code icode = optab_handler (vec_set_optab, outermode); - create_fixed_operand (&ops[0], op0); - create_input_operand (&ops[1], value, innermode); - create_integer_operand (&ops[2], pos); - if (maybe_expand_insn (icode, 3, ops)) - return true; + /* Subreg expressions should operate on scalars only. Subregs on + vectors are not canonical. Extractions from vectors should use + vector operations instead. 
*/ + bool is_non_canon_subreg = GET_CODE (value) == SUBREG + && VECTOR_MODE_P (fieldmode) + && !VECTOR_MODE_P ( + GET_MODE (SUBREG_REG (value))); + + /* If the value to be written is a memory expression or a non-canonical + scalar to vector subreg, don't try to generate a vec_set pattern. + Instead, fall back and try to generate an instruction without + touching the operands. */ + if (!MEM_P (value) && !is_non_canon_subreg) + { + if (VECTOR_MODE_P (fieldmode)) + { + /* Handle the case where both the value to be written and the + destination are vectors. */ + + HOST_WIDE_INT op_elem_num + = GET_MODE_NUNITS (outermode).to_constant (); + rtx concat_rtx = value; + rtx_insn *last_insn = get_last_insn (); + HOST_WIDE_INT index = 0; + /* If the store position is not at the start of the bitfield, + store the value by selecting the first pos elements of the + vector and then placing the value after them, using + a vec_concat. */ + if (pos.to_constant () > 0) + { + concat_rtx = generate_vec_concat (fieldmode, op0, value, 0, + pos.to_constant ()); + + index = pos.to_constant () + bitsize.to_constant () + / GET_MODE_UNIT_BITSIZE (outermode); + } + + /* Reconstruct the rest of the vector, after the value. */ + if (index < op_elem_num) + concat_rtx = generate_vec_concat (fieldmode, op0, concat_rtx, + index, op_elem_num); + + rtx_insn *set_insn = emit_insn (gen_rtx_SET (op0, concat_rtx)); + + if (recog_memoized (set_insn) >= 0) + return true; + else + delete_insns_since (last_insn); + } + else if (fieldmode != innermode) + { + /* Handle the case where the destination is a vector and + the value's mode is different than the vector's inner + mode. We have to treat the bitfield insertion differently + depending on which of those modes is wider than the other. 
*/ + + if (known_gt (GET_MODE_SIZE (fieldmode), + GET_MODE_SIZE (innermode))) + { + /* If the value's mode is wider than the vector's inner + mode, extract a part from the value with size equal + to the vector's inner mode size and write it in the + appropriate position inside the vector, using a vec_set + pattern. Repeat, until the whole value is written. */ + + unsigned int curr_pos = 0; + bool failed = false; + rtx_insn *last_insn = get_last_insn (); + while (curr_pos < bitsize.to_constant ()) + { + + rtx subreg = gen_reg_rtx (innermode); + unsigned int innermode_size = GET_MODE_BITSIZE (innermode); + rtx bitfield + = extract_bit_field (value, innermode_size, + curr_pos, 0, NULL_RTX, innermode, + innermode, false, 0); + + store_bit_field_1 (subreg, innermode_size, 0, 0, + 0, innermode, bitfield, false, false, + false); + + HOST_WIDE_INT index + = pos.to_constant () + curr_pos / innermode_size; + create_fixed_operand (&ops[0], op0); + create_input_operand (&ops[1], subreg, innermode); + create_integer_operand (&ops[2], index); + if (!maybe_expand_insn (icode, 3, ops)) + { + failed = true; + break; + } + + curr_pos += innermode_size; + } + + if (!failed) + return true; + else + delete_insns_since (last_insn); + } + else if (known_lt (GET_MODE_SIZE (fieldmode), + GET_MODE_SIZE (innermode))) + { + /* If the value's mode is narrower than the vector's inner + mode, extend the value's mode to the vector's inner + mode and use a vec_set pattern for the insertion. 
*/ + + rtx ext_value = gen_rtx_ZERO_EXTEND (innermode, value); + + create_fixed_operand (&ops[0], op0); + create_input_operand (&ops[1], ext_value, innermode); + create_integer_operand (&ops[2], pos); + if (maybe_expand_insn (icode, 3, ops)) + return true; + } + } + } + + if (fieldmode == innermode + && known_eq (bitsize, GET_MODE_PRECISION (innermode))) + { + create_fixed_operand (&ops[0], op0); + create_input_operand (&ops[1], value, innermode); + create_integer_operand (&ops[2], pos); + if (maybe_expand_insn (icode, 3, ops)) + return true; + } } /* If the target is a register, overwriting the entire object, or storing diff --git a/gcc/testsuite/gcc.target/i386/pr118873.c b/gcc/testsuite/gcc.target/i386/pr118873.c new file mode 100644 index 000000000000..3a07c7cc87f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr118873.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -favoid-store-forwarding" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +typedef int v4si __attribute__((vector_size(16))); +typedef int v8si __attribute__((vector_size(32))); + +v8si a; +v4si b; + +/* +** foo: +** ... +** vmovdqa a\(%rip\), %ymm0 +** vmovdqa b\(%rip\), %xmm1 +** vmovdqa %ymm0, \(%rdi\) +** vmovdqa 16\(%rdi\), %ymm0 +** vmovdqa %xmm1, 32\(%rdi\) +** vinserti128 \$0x1, %xmm1, %ymm0, %ymm0 +** vmovdqa %ymm0, a\(%rip\) +** vzeroupper +** ret +** ... +*/ +void foo (int *p) +{ + v8si aa = a; + v4si bb = b; + *(v8si *)p = a; + *(v4si *)(p + 8) = b; + a = *(v8si *)(p + 4); +} + -- 2.49.0