On Mon, 26 May 2025, Konstantinos Eleftheriou wrote:

> In `store_bit_field_1`, when the value to be written in the bitfield
> and/or the bitfield itself have vector modes, non-canonical subregs
> are generated, like `(subreg:V4SI (reg:V8SI x) 0)`. If one of them is
> a scalar, this happens only when the scalar mode is different from the
> vector's inner mode.
> 
> This patch tries to prevent this, using vec_set patterns when
> possible.
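> 
> For illustration (a sketch, little-endian element ordering, with
> hypothetical pseudos x and y): writing a V4SI value y into the low
> half of a V8SI register x currently expands through a subreg like
> 
>   (set (subreg:V4SI (reg:V8SI x) 0) (reg:V4SI y))
> 
> while the intended equivalent vector form is
> 
>   (set (reg:V8SI x)
>        (vec_concat:V8SI
>          (reg:V4SI y)
>          (vec_select:V4SI (reg:V8SI x)
>                           (parallel [(const_int 4) (const_int 5)
>                                      (const_int 6) (const_int 7)]))))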

I know almost nothing about this code, but why does the patch
fix things up after the fact rather than avoid generating the
SUBREG in the first place?

ISTR it also (unfortunately) depends on the target which forms
are considered canonical.

I'm also not sure you got endianness right for all possible
values of SUBREG_BYTE.  One more reason not to generate such
subregs in the first place but to stick to vec_select/concat.
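
As an illustration (a sketch only, not checked against a big-endian
target): the element indices in

  (vec_select:V4SI (reg:V8SI x)
                   (parallel [(const_int 4) (const_int 5)
                              (const_int 6) (const_int 7)]))

mean the same lanes regardless of byte order, while which lanes
(subreg:V4SI (reg:V8SI x) 16) covers depends on how the target
maps SUBREG_BYTE onto registers.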

Richard.

> Bootstrapped/regtested on AArch64 and x86_64.
> 
>       PR rtl-optimization/118873
> 
> gcc/ChangeLog:
> 
>       * expmed.cc (generate_vec_concat): New function.
>       (store_bit_field_1): Check for cases where the value
>       to be written and/or the bitfield have vector modes
>       and try to generate the corresponding vec_set patterns
>       instead of subregs.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/i386/pr118873.c: New test.
> ---
>  gcc/expmed.cc                            | 174 ++++++++++++++++++++++-
>  gcc/testsuite/gcc.target/i386/pr118873.c |  33 +++++
>  2 files changed, 200 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr118873.c
> 
> diff --git a/gcc/expmed.cc b/gcc/expmed.cc
> index 8cf10d9c73bf..8c641f55b9c6 100644
> --- a/gcc/expmed.cc
> +++ b/gcc/expmed.cc
> @@ -740,6 +740,42 @@ store_bit_field_using_insv (const extraction_insn *insv, rtx op0,
>    return false;
>  }
>  
> +/* Helper function for store_bit_field_1, used in the case that the bitfield
> +   and the destination are both vectors.  It extracts the elements of OP
> +   from LOWER_BOUND up to (but not including) UPPER_BOUND using a vec_select
> +   and uses a vec_concat to concatenate the extracted elements with VALUE.  */
> +
> +rtx
> +generate_vec_concat (machine_mode fieldmode, rtx op, rtx value,
> +                  HOST_WIDE_INT lower_bound,
> +                  HOST_WIDE_INT upper_bound)
> +{
> +  if (!VECTOR_MODE_P (fieldmode))
> +    return NULL_RTX;
> +
> +  rtvec vec = rtvec_alloc (GET_MODE_NUNITS (fieldmode).to_constant ());
> +  machine_mode outermode = GET_MODE (op);
> +
> +  for (HOST_WIDE_INT i = lower_bound; i < upper_bound; ++i)
> +    RTVEC_ELT (vec, i) = GEN_INT (i);
> +  rtx par = gen_rtx_PARALLEL (VOIDmode, vec);
> +  rtx select = gen_rtx_VEC_SELECT (fieldmode, op, par);
> +  if (BYTES_BIG_ENDIAN)
> +    {
> +      if (lower_bound > 0)
> +        return gen_rtx_VEC_CONCAT (outermode, select, value);
> +      else
> +        return gen_rtx_VEC_CONCAT (outermode, value, select);
> +    }
> +  else
> +    {
> +      if (lower_bound > 0)
> +        return gen_rtx_VEC_CONCAT (outermode, value, select);
> +      else
> +        return gen_rtx_VEC_CONCAT (outermode, select, value);
> +    }
> +}
> +
>  /* A subroutine of store_bit_field, with the same arguments.  Return true
>     if the operation could be implemented.
>  
> @@ -778,18 +814,142 @@ store_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, poly_uint64 bitnum,
>    if (VECTOR_MODE_P (outermode)
>        && !MEM_P (op0)
>        && optab_handler (vec_set_optab, outermode) != CODE_FOR_nothing
> -      && fieldmode == innermode
> -      && known_eq (bitsize, GET_MODE_PRECISION (innermode))
>        && multiple_p (bitnum, GET_MODE_PRECISION (innermode), &pos))
>      {
> +      /* Cases where the destination's inner mode is not equal to the
> +      value's mode need special treatment.  */
> +
>        class expand_operand ops[3];
>        enum insn_code icode = optab_handler (vec_set_optab, outermode);
>  
> -      create_fixed_operand (&ops[0], op0);
> -      create_input_operand (&ops[1], value, innermode);
> -      create_integer_operand (&ops[2], pos);
> -      if (maybe_expand_insn (icode, 3, ops))
> -        return true;
> +      /* Subreg expressions should operate on scalars only.  Subregs on
> +      vectors are not canonical.  Extractions from vectors should use
> +      vector operations instead.  */
> +      bool is_non_canon_subreg = GET_CODE (value) == SUBREG
> +                              && VECTOR_MODE_P (fieldmode)
> +                              && !VECTOR_MODE_P (
> +                                 GET_MODE (SUBREG_REG (value)));
> +
> +      /* If the value to be written is a memory expression or a non-canonical
> +      scalar to vector subreg, don't try to generate a vec_set pattern.
> +      Instead, fall back and try to generate an instruction without
> +      touching the operands.  */
> +      if (!MEM_P (value) && !is_non_canon_subreg)
> +      {
> +     if (VECTOR_MODE_P (fieldmode))
> +       {
> +         /* Handle the case where both the value to be written and the
> +            destination are vectors.  */
> +
> +         HOST_WIDE_INT op_elem_num
> +           = GET_MODE_NUNITS (outermode).to_constant ();
> +         rtx concat_rtx = value;
> +         rtx_insn *last_insn = get_last_insn ();
> +         HOST_WIDE_INT index = 0;
> +         /* If the store position is not at the start of the bitfield,
> +            store the value by selecting the first pos elements of the
> +            vector and then placing the value after them, using
> +            a vec_concat.  */
> +         if (pos.to_constant () > 0)
> +           {
> +             concat_rtx = generate_vec_concat (fieldmode, op0, value, 0,
> +                                               pos.to_constant ());
> +
> +             index = pos.to_constant () + bitsize.to_constant ()
> +                     / GET_MODE_UNIT_BITSIZE (outermode);
> +           }
> +
> +         /* Reconstruct the rest of the vector, after the value.  */
> +         if (index < op_elem_num)
> +           concat_rtx = generate_vec_concat (fieldmode, op0, concat_rtx,
> +                                             index, op_elem_num);
> +
> +         rtx_insn *set_insn = emit_insn (gen_rtx_SET (op0, concat_rtx));
> +
> +         if (recog_memoized (set_insn) >= 0)
> +           return true;
> +         else
> +           delete_insns_since (last_insn);
> +       }
> +     else if (fieldmode != innermode)
> +       {
> +         /* Handle the case where the destination is a vector and
> +            the value's mode is different from the vector's inner
> +            mode.  We have to treat the bitfield insertion differently
> +            depending on which of those modes is wider than the other.  */
> +
> +         if (known_gt (GET_MODE_SIZE (fieldmode),
> +                       GET_MODE_SIZE (innermode)))
> +           {
> +             /* If the value's mode is wider than the vector's inner
> +                mode, extract a part from the value with size equal
> +                to the vector's inner mode size and write it in the
> +                appropriate position inside the vector, using a vec_set
> +                pattern.  Repeat, until the whole value is written.  */
> +
> +             unsigned int curr_pos = 0;
> +             bool failed = false;
> +             rtx_insn *last_insn = get_last_insn ();
> +             while (curr_pos < bitsize.to_constant ())
> +               {
> +
> +                 rtx subreg = gen_reg_rtx (innermode);
> +                 unsigned int innermode_size = GET_MODE_BITSIZE (innermode);
> +                 rtx bitfield
> +                   = extract_bit_field (value, innermode_size,
> +                                        curr_pos, 0, NULL_RTX, innermode,
> +                                        innermode, false, 0);
> +
> +                 store_bit_field_1 (subreg, innermode_size, 0, 0,
> +                                    0, innermode, bitfield, false, false,
> +                                    false);
> +
> +                 HOST_WIDE_INT index
> +                     = pos.to_constant () + curr_pos / innermode_size;
> +                 create_fixed_operand (&ops[0], op0);
> +                 create_input_operand (&ops[1], subreg, innermode);
> +                 create_integer_operand (&ops[2], index);
> +                 if (!maybe_expand_insn (icode, 3, ops))
> +                   {
> +                     failed = true;
> +                     break;
> +                   }
> +
> +                 curr_pos += innermode_size;
> +               }
> +
> +             if (!failed)
> +               return true;
> +             else
> +               delete_insns_since (last_insn);
> +           }
> +         else if (known_lt (GET_MODE_SIZE (fieldmode),
> +                            GET_MODE_SIZE (innermode)))
> +           {
> +             /* If the value's mode is narrower than the vector's inner
> +                mode, extend the value's mode to the vector's inner
> +                mode and use a vec_set pattern for the insertion.  */
> +
> +             rtx ext_value = gen_rtx_ZERO_EXTEND (innermode, value);
> +
> +             create_fixed_operand (&ops[0], op0);
> +             create_input_operand (&ops[1], ext_value, innermode);
> +             create_integer_operand (&ops[2], pos);
> +             if (maybe_expand_insn (icode, 3, ops))
> +               return true;
> +           }
> +       }
> +     }
> +
> +      if (fieldmode == innermode
> +       && known_eq (bitsize, GET_MODE_PRECISION (innermode)))
> +     {
> +       create_fixed_operand (&ops[0], op0);
> +       create_input_operand (&ops[1], value, innermode);
> +       create_integer_operand (&ops[2], pos);
> +       if (maybe_expand_insn (icode, 3, ops))
> +         return true;
> +     }
>      }
>  
>    /* If the target is a register, overwriting the entire object, or storing
> diff --git a/gcc/testsuite/gcc.target/i386/pr118873.c b/gcc/testsuite/gcc.target/i386/pr118873.c
> new file mode 100644
> index 000000000000..3a07c7cc87f9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr118873.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -favoid-store-forwarding" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef int v4si __attribute__((vector_size(16)));
> +typedef int v8si __attribute__((vector_size(32)));
> +
> +v8si a;
> +v4si b;
> +
> +/*
> +** foo:
> +**   ...
> +**   vmovdqa a\(%rip\), %ymm0
> +**   vmovdqa b\(%rip\), %xmm1
> +**   vmovdqa %ymm0, \(%rdi\)
> +**   vmovdqa 16\(%rdi\), %ymm0
> +**   vmovdqa %xmm1, 32\(%rdi\)
> +**   vinserti128     \$0x1, %xmm1, %ymm0, %ymm0
> +**   vmovdqa %ymm0, a\(%rip\)
> +**   vzeroupper
> +**   ret
> +**   ...
> +*/
> +void foo (int *p)
> +{
> +  v8si aa = a;
> +  v4si bb = b;
> +  *(v8si *)p = a;
> +  *(v4si *)(p + 8) = b;
> +  a = *(v8si *)(p + 4);
> +}
> +
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
